.zip`. The `example_id` follows the same structure as the examples from the corresponding site in `config_files/`.
+
diff --git a/README_WA.md b/README_WA.md
new file mode 100644
index 0000000..2de728d
--- /dev/null
+++ b/README_WA.md
@@ -0,0 +1,105 @@
+# WebArena: A Realistic Web Environment for Building Autonomous Agents
+
+
+
+ WebArena is a standalone, self-hostable web environment for building autonomous agents
+
+
+
+
+
+
+
+
+
+
+
+
+Website •
+Paper •
+Leaderboard
+
+
+
+
+## Quick Walkthrough
+Check out [this script](minimal_example.py) for a quick walkthrough on how to set up the browser environment and interact with it using the demo sites we hosted. This script is for educational purposes only; to perform *reproducible* experiments, please check out the next section. In a nutshell, using WebArena is very similar to using OpenAI Gym. The following code snippet shows how to interact with the environment.
+```python
+import random
+
+from browser_env import ScriptBrowserEnv, create_id_based_action
+# init the environment
+env = ScriptBrowserEnv(
+ headless=False,
+ observation_type="accessibility_tree",
+ current_viewport_only=True,
+ viewport_size={"width": 1280, "height": 720},
+)
+# prepare the environment for a configuration defined in a json file
+config_file = "config_files/0.json"
+obs, info = env.reset(options={"config_file": config_file})
+# get the text observation (e.g., html, accessibility tree) through obs["text"]
+
+# create a random action
+element_id = random.randint(0, 1000)
+action = create_id_based_action(f"click [{element_id}]")
+
+# take the action
+obs, _, terminated, _, info = env.step(action)
+```
+## WebArena Environment Setup
+1. Set up the standalone environment.
+Please check out [this page](environment_docker/README.md) for details.
+
+2. Export the `DATASET` to be `webarena`:
+```bash
+export DATASET=webarena
+```
+
+3. Configure the URLs for each website.
+```bash
+export SHOPPING=":7770"
+export SHOPPING_ADMIN=":7780/admin"
+export REDDIT=":9999"
+export GITLAB=":8023"
+export MAP=":3000"
+export WIKIPEDIA=":8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing"
+export HOMEPAGE=":4399" # this is a placeholder
+```
+
+> You are encouraged to update the environment variables in the [GitHub workflow](.github/workflows/tests.yml#L7) to ensure the correctness of unit tests.
+
+4. Generate the config file for each test example.
+```bash
+python scripts/generate_test_data.py
+```
+You will see `*.json` files generated in the [config_files](./wa/config_files) folder. Each file contains the configuration for one test example.
+
+## Develop Your Prompt-based Agent
+1. Define the prompts. We provide two baseline agents whose corresponding prompts are listed [here](./agent/prompts/raw). Each prompt is a dictionary with the following keys:
+```python
+prompt = {
+ "intro": ,
+ "examples": [
+ (
+ example_1_observation,
+ example_1_response
+ ),
+ (
+ example_2_observation,
+ example_2_response
+ ),
+ ...
+ ],
+ "template": ,
+ "meta_data": {
+ "observation": ,
+ "action_type": ,
+ "keywords": ,
+ "prompt_constructor": ,
+ "action_splitter":
+ }
+ }
+```
+
+2. Implement the prompt constructor. An example prompt constructor using Chain-of-thought/ReAct style reasoning is [here](./agent/prompts/prompt_constructor.py#L184). The prompt constructor is a class with the following methods:
+* `construct`: construct the input fed to an LLM
+* `_extract_action`: given the generation from an LLM, how to extract the phrase that corresponds to the action
diff --git a/browser_env/auto_login.py b/browser_env/auto_login.py
index 67a22c9..d1434dd 100644
--- a/browser_env/auto_login.py
+++ b/browser_env/auto_login.py
@@ -50,6 +50,7 @@
assert len(SITES) == len(URLS) == len(EXACT_MATCH) == len(KEYWORDS)
+
def is_expired(
storage_state: Path, url: str, keyword: str, url_exact: bool = True
) -> bool:
@@ -63,7 +64,7 @@ def is_expired(
context = browser.new_context(storage_state=storage_state)
page = context.new_page()
page.goto(url)
- time.sleep(1)
+ page.wait_for_timeout(1000)
d_url = page.url
content = page.content()
context_manager.__exit__()
diff --git a/config_files/wa/test_webarena_v2.raw.json b/config_files/wa/test_webarena_v2.raw.json
new file mode 100644
index 0000000..4bf1fdf
--- /dev/null
+++ b/config_files/wa/test_webarena_v2.raw.json
@@ -0,0 +1,29819 @@
+[
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 0,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "What is the top-{{n}} best-selling product in {{year}}",
+ "instantiation_dict": {
+ "n": 1,
+ "year": 2022
+ },
+ "intent": "What is the top-1 best-selling product in 2022",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Quest Lumaflex\u2122 Band"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Quest Lumaflex\u2122 Band",
+ "procedure_note": "Select the best seller report, config the period, from and to. Quest Lumaflex\u2122 Band has 5 order quantities."
+ },
+ "intent_template_id": 279
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 1,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "What is the top-{{n}} best-selling brand in {{period}}",
+ "instantiation_dict": {
+ "n": 1,
+ "period": "Quarter 1 2022"
+ },
+ "intent": "What is the top-1 best-selling brand in Quarter 1 2022",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Sprite"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Sprite",
+ "procedure_note": "Select the order report, config the period, from and to. The result will list all orders. There are 9 Sprite products ordered"
+ },
+ "intent_template_id": 279
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 3,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "What are the products ranked as the top-{{n}} best-selling in {{year}}, including any ties at these ranks?",
+ "instantiation_dict": {
+ "n": 2,
+ "year": 2022
+ },
+ "intent": "What are the products ranked as the top-2 best-selling in 2022, including any ties at these ranks?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Quest Lumaflex\u2122 Band",
+ "Cruise Dual Analog Watch",
+ "Sprite Stasis Ball 65 cm",
+ "Sprite Stasis Ball 55 cm"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": ""
+ },
+ "intent_template_id": 279,
+ "procedure_note": "Both order report, or best-seller report"
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 4,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "What are the products ranked as the top-{{n}} best-selling in {{period}}, including any ties at this ranks?",
+ "instantiation_dict": {
+ "n": 1,
+ "period": "Jan 2023"
+ },
+ "intent": "What are the products ranked as the top-1 best-selling in Jan 2023, including any ties at this ranks?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Impulse Duffle",
+ "Overnight Duffle",
+ "Hawkeye Yoga Short"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": ""
+ },
+ "intent_template_id": 279
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 5,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "What is the best-selling product category in {{period}}. My store sells top, bottom, gear and misc products.",
+ "instantiation_dict": {
+ "period": "Jan 2023"
+ },
+ "intent": "What is the best-selling product category in Jan 2023. My store sells top, bottom, gear and misc products.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Top"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Top",
+ "procedure_note": "Get all orders for January 2023. Categorize the products into top, bottom, gear and misc. Count the number of products in each category. Top has 16 products ordered."
+ },
+ "intent_template_id": 279
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 6,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "What are the products ranked as the top-{{n}} best-selling in {{period}}, including any ties at this ranks?",
+ "instantiation_dict": {
+ "n": 1,
+ "period": "2022 pride month"
+ },
+ "intent": "What are the products ranked as the top-1 best-selling in 2022 pride month, including any ties at this ranks?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Harmony Lumaflex\u2122 Strength Band Kit",
+ "Affirm Water Bottle"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": ""
+ },
+ "intent_template_id": 279
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 7,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Tell me the full address of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}",
+ "instantiation_dict": {
+ "airport_type": "international airports",
+ "start": "Carnegie Mellon University",
+ "radius": "50 km"
+ },
+ "intent": "Tell me the full address of all international airports that are within a driving distance of 50 km to Carnegie Mellon University",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Pittsburgh International Airport People Movers, Airport Boulevard, Findlay Township, Allegheny County, Pennsylvania, 15231, United States"
+ },
+ "intent_template_id": 79
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 8,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Tell me the full address of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}",
+ "instantiation_dict": {
+ "airport_type": "international airports",
+ "start": "Carnegie Mellon University",
+ "radius": "5 km"
+ },
+ "intent": "Tell me the full address of all international airports that are within a driving distance of 5 km to Carnegie Mellon University",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "There is no airport within 5 km of Carnegie Mellon University"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 79
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 9,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Tell me the full address of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}",
+ "instantiation_dict": {
+ "airport_type": "international airports",
+ "start": "Carnegie Art Museum",
+ "radius": "30 km"
+ },
+ "intent": "Tell me the full address of all international airports that are within a driving distance of 30 km to Carnegie Art Museum",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Pittsburgh International Airport People Movers, Airport Boulevard, Findlay Township, Allegheny County, Pennsylvania, 15231, United States"
+ },
+ "intent_template_id": 79
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 10,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Tell me the full address of all {{airport_type}} that are within a driving distance of {{radius}} to {{start}}",
+ "instantiation_dict": {
+ "airport_type": "US international airports",
+ "start": "Niagara Falls",
+ "radius": "60 km"
+ },
+ "intent": "Tell me the full address of all US international airports that are within a driving distance of 60 km to Niagara Falls",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States",
+ "Buffalo-Niagara International Airport, Holtz Drive, Town of Cheektowaga, Erie County, New York, 14225, United States"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States Buffalo-Niagara International Airport, South Youngs Road, Town of Cheektowaga, Erie County, New York, 14221, United States"
+ },
+ "intent_template_id": 79
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 11,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Tell me the the number of reviews that our store received so far that mention term \"{{term}}\"",
+ "instantiation_dict": {
+ "term": "disappointed"
+ },
+ "intent": "Tell me the the number of reviews that our store received so far that mention term \"disappointed\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "6"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "6"
+ },
+ "intent_template_id": 288
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 12,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Tell me the the number of reviews that our store received so far that mention term \"{{term}}\"",
+ "instantiation_dict": {
+ "term": "satisfied"
+ },
+ "intent": "Tell me the the number of reviews that our store received so far that mention term \"satisfied\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "2"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "2"
+ },
+ "intent_template_id": 288
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 13,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Tell me the the number of reviews that our store received so far that mention term \"{{term}}\"",
+ "instantiation_dict": {
+ "term": "decent"
+ },
+ "intent": "Tell me the the number of reviews that our store received so far that mention term \"decent\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "2"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "2"
+ },
+ "intent_template_id": 288
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 14,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Tell me the the number of reviews that our store received so far that mention term \"{{term}}\"",
+ "instantiation_dict": {
+ "term": "not useful"
+ },
+ "intent": "Tell me the the number of reviews that our store received so far that mention term \"not useful\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "0"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "0"
+ },
+ "intent_template_id": 288
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 15,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Tell me the the number of reviews that our store received so far that mention term \"{{term}}\"",
+ "instantiation_dict": {
+ "term": "best"
+ },
+ "intent": "Tell me the the number of reviews that our store received so far that mention term \"best\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "2"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "2"
+ },
+ "intent_template_id": 288
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 16,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Compare the time for walking and driving route from {{start}} to {{end}}",
+ "instantiation_dict": {
+ "start": "5000 Fifth Avenue, Pittsburgh",
+ "end": "UPMC family health center"
+ },
+ "intent": "Compare the time for walking and driving route from 5000 Fifth Avenue, Pittsburgh to UPMC family health center",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "driving: 2min",
+ "walking: 16min"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Driving: 2min. Walking: 16min."
+ },
+ "intent_template_id": 73
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 17,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Compare the time for walking and driving route from {{start}} to {{end}}",
+ "instantiation_dict": {
+ "start": "AMC Waterfront",
+ "end": "Carnegie Mellon University"
+ },
+ "intent": "Compare the time for walking and driving route from AMC Waterfront to Carnegie Mellon University",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "driving: 13min",
+ "walking: 1h 35min"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "driving: 13min, walking: 1h 35min."
+ },
+ "intent_template_id": 73
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 18,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Compare the time for walking and driving route from {{start}} to {{end}}",
+ "instantiation_dict": {
+ "start": "AMC Waterfront",
+ "end": "Univ of Pittsburgh"
+ },
+ "intent": "Compare the time for walking and driving route from AMC Waterfront to Univ of Pittsburgh",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "driving: 15min",
+ "walking: 1h 47min"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "driving: 15min, walking: 1h 47min."
+ },
+ "intent_template_id": 73
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 19,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Compare the time for walking and driving route from {{start}} to {{end}}",
+ "instantiation_dict": {
+ "start": "Carnegie Science Center",
+ "end": "Carnegie Mellon University"
+ },
+ "intent": "Compare the time for walking and driving route from Carnegie Science Center to Carnegie Mellon University",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "driving: 12min",
+ "walking: 1h 44min."
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "driving: 12min, walking: 1h 44min."
+ },
+ "intent_template_id": 73
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 20,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Compare the difference in time for walking and driving route from {{start}} to {{end}}",
+ "instantiation_dict": {
+ "start": "Randyland",
+ "end": "Carnegie Mellon University"
+ },
+ "intent": "Compare the difference in time for walking and driving route from Randyland to Carnegie Mellon University",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "driving: 13min",
+ "walking: 1h 45min"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "driving: 13min, walking: 1h 45min."
+ },
+ "intent_template_id": 73
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 21,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/6s-wireless-headphones-over-ear-noise-canceling-hi-fi-bass-foldable-stereo-wireless-kid-headsets-earbuds-with-built-in-mic-micro-sd-tf-fm-for-iphone-samsung-ipad-pc-black-gold.html",
+ "geolocation": null,
+ "intent_template": "List out reviewers, if exist, who mention about {{description}}",
+ "instantiation_dict": {
+ "description": "ear cups being small"
+ },
+ "intent": "List out reviewers, if exist, who mention about ear cups being small",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Joseph Brzezinski",
+ "Catso",
+ "Dibbins",
+ "Anglebert Dinkherhump",
+ "Michelle Davis"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Joseph Brzezinski, Catso, Dibbins, Anglebert Dinkherhump, Michelle Davis"
+ },
+ "intent_template_id": 222
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 22,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/fujifilm-finepix-z200fd-10mp-digital-camera-with-5x-optical-dual-image-stabilized-zoom-black.html",
+ "geolocation": null,
+ "intent_template": "List out reviewers, if exist, who mention about {{description}}",
+ "instantiation_dict": {
+ "description": "under water photo"
+ },
+ "intent": "List out reviewers, if exist, who mention about under water photo",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "There is no review that mention about under water photo for the given product"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 222
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 23,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/3-pack-samsung-galaxy-s6-screen-protector-nearpow-tempered-glass-screen-protector-with-9h-hardness-crystal-clear-easy-bubble-free-installation-scratch-resist.html",
+ "geolocation": null,
+ "intent_template": "List out reviewers, if exist, who mention about {{description}}",
+ "instantiation_dict": {
+ "description": "good fingerprint resistant"
+ },
+ "intent": "List out reviewers, if exist, who mention about good fingerprint resistant",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Rachel",
+ "T. Gannon"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Rachel, T. Gannon, "
+ },
+ "intent_template_id": 222
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 24,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/haflinger-men-s-wool-felt-open-back-slippers-beige-550-peat-us-7.html",
+ "geolocation": null,
+ "intent_template": "List out reviewers, if exist, who mention about {{description}}",
+ "instantiation_dict": {
+ "description": "price being unfair"
+ },
+ "intent": "List out reviewers, if exist, who mention about price being unfair",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "There is no reivew that mention about price being unfair for the given product"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 222
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 25,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/epson-workforce-wf-3620-wifi-direct-all-in-one-color-inkjet-printer-copier-scanner-amazon-dash-replenishment-ready.html",
+ "geolocation": null,
+ "intent_template": "List out reviewers, if exist, who mention about {{description}}",
+ "instantiation_dict": {
+ "description": "average print quality"
+ },
+ "intent": "List out reviewers, if exist, who mention about average print quality",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Goldfish",
+ "Roxanne Brandon Coffey"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "GoldfishGoldfish, Roxanne Brandon Coffey"
+ },
+ "intent_template_id": 222
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 26,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/epson-workforce-wf-3620-wifi-direct-all-in-one-color-inkjet-printer-copier-scanner-amazon-dash-replenishment-ready.html",
+ "geolocation": null,
+ "intent_template": "List out reviewers, if exist, who mention about {{description}}",
+ "instantiation_dict": {
+ "description": "complain of the customer service"
+ },
+ "intent": "List out reviewers, if exist, who mention about complain of the customer service",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Bob in Vegas",
+ "RemyR"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Bob in Vegas, RemyRRemyR"
+ },
+ "intent_template_id": 222
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 27,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.",
+ "instantiation_dict": {
+ "forum": "Showerthoughts"
+ },
+ "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "0"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "0"
+ },
+ "intent_template_id": 33
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 28,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.",
+ "instantiation_dict": {
+ "forum": "Worcester"
+ },
+ "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Worcester forum.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "0"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "0"
+ },
+ "intent_template_id": 33
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 29,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.",
+ "instantiation_dict": {
+ "forum": "DIY"
+ },
+ "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the DIY forum.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "1"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "1"
+ },
+ "intent_template_id": 33
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 30,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.",
+ "instantiation_dict": {
+ "forum": "space"
+ },
+ "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the space forum.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "0"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "0"
+ },
+ "intent_template_id": 33
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 31,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the {{forum}} forum.",
+ "instantiation_dict": {
+ "forum": "photoshopbattles"
+ },
+ "intent": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the photoshopbattles forum.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "0"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "0"
+ },
+ "intent_template_id": 33
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 32,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "I will arrive {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.",
+ "instantiation_dict": {
+ "place": "Pittsburgh Airport",
+ "information": "walking distance",
+ "target1": "Hilton hotel",
+ "target2": "the nearest supermarket owned by a local company"
+ },
+ "intent": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the walking distance to the nearest supermarket owned by a local company from the hotel.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does the passage clearly state the name of the hotel is DoubleTree by Hilton Hotel Pittsburgh Airport?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does the passage clearly state the distance is 2.0km?",
+ "answer": "yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "DoubleTree by Hilton Hotel Pittsburgh Airport Distance: 2.0km"
+ },
+ "intent_template_id": 78
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 33,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "I will arrive {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.",
+ "instantiation_dict": {
+ "place": "Pittsburgh Airport",
+ "target1": "Hilton hotel",
+ "information": "shortest walking distance",
+ "target2": "a supermarket"
+ },
+ "intent": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the shortest walking distance to a supermarket from the hotel.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does the passage clearly state the name of the hotel is DoubleTree by Hilton Hotel Pittsburgh Airport?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does the passage clearly state the distance is 1.4km?",
+ "answer": "yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "DoubleTree by Hilton Hotel Pittsburgh Airport Distance: 1.4km"
+ },
+ "intent_template_id": 78
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 34,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "I will arrive {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.",
+ "instantiation_dict": {
+ "place": "Pittsburgh Airport",
+ "target1": "Hyatt hotel",
+ "information": "shortest walking time",
+ "target2": "a supermarket"
+ },
+ "intent": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the shortest walking time to a supermarket from the hotel.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does the passage clearly state the name of the hotel is Hyatt Regency Pittsburgh International Airport?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does the passage clearly state the walking time is 3h 30min?",
+ "answer": "yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Hyatt Regency Pittsburgh International Airport\n3:30"
+ },
+ "intent_template_id": 78
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 35,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "I will arrive {{place}} soon. Provide the name of a {{target1}} in the vicinity, if available. Then, tell me the {{information}} to {{target2}} from the hotel.",
+ "instantiation_dict": {
+ "place": "Pittsburgh Airport",
+ "target1": "Hyatt hotel",
+ "information": "minimal driving time",
+ "target2": "a supermarket"
+ },
+ "intent": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the minimal driving time to a supermarket from the hotel.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does the passage clearly state the name of the hotel is Hyatt Regency Pittsburgh International Airport?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does the passage clearly state the driving time is 15min?",
+ "answer": "yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Hyatt Regency Pittsburgh International Airport Time: 15min"
+ },
+ "intent_template_id": 78
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 36,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}",
+ "instantiation_dict": {
+ "place": "social security administration",
+ "location": "Carnegie Mellon University"
+ },
+ "intent": "Check if the social security administration in pittsburgh can be reached in one hour by car from Carnegie Mellon University",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Yes"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Yes"
+ },
+ "intent_template_id": 77
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 37,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}",
+ "instantiation_dict": {
+ "place": "police station",
+ "location": "gates building at CMU"
+ },
+ "intent": "Check if the police station in pittsburgh can be reached in one hour by car from gates building at CMU",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Yes"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Yes"
+ },
+ "intent_template_id": 77
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 38,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}",
+ "instantiation_dict": {
+ "place": "duquesne university",
+ "location": "pittsburgh airport"
+ },
+ "intent": "Check if the duquesne university in pittsburgh can be reached in one hour by car from pittsburgh airport",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Yes"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Yes"
+ },
+ "intent_template_id": 77
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 39,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}",
+ "instantiation_dict": {
+ "place": "walmart",
+ "location": "5600 fifth avenue"
+ },
+ "intent": "Check if the walmart in pittsburgh can be reached in one hour by car from 5600 fifth avenue",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Yes"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Yes"
+ },
+ "intent_template_id": 77
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 40,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Check if the {{place}} in pittsburgh can be reached in one hour by car from {{location}}",
+ "instantiation_dict": {
+ "place": "amc theatre",
+ "location": "hobart street"
+ },
+ "intent": "Check if the amc theatre in pittsburgh can be reached in one hour by car from hobart street",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Yes"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Yes"
+ },
+ "intent_template_id": 77
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 41,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "List the top {{n}} most searched terms in my store",
+ "instantiation_dict": {
+ "n": "1"
+ },
+ "intent": "List the top 1 most searched terms in my store",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "hollister"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "hollister"
+ },
+ "intent_template_id": 285
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 42,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "List the top {{n}} most searched terms in my store",
+ "instantiation_dict": {
+ "n": "2"
+ },
+ "intent": "List the top 2 most searched terms in my store",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "hollister",
+ "Joust Bag"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "hollister, Joust Bag"
+ },
+ "intent_template_id": 285
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 43,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "List the top {{n}} most searched terms in my store",
+ "instantiation_dict": {
+ "n": "3"
+ },
+ "intent": "List the top 3 most searched terms in my store",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "hollister",
+ "Joust Bag",
+ "nike"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "hollister, Joust Bag, nike"
+ },
+ "intent_template_id": 285
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 44,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Check out my todos",
+ "instantiation_dict": {},
+ "intent": "Check out my todos",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/dashboard/todos",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 303
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 45,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/a11yproject/a11yproject.com",
+ "geolocation": null,
+ "intent_template": "Check out the oldest closed issues",
+ "instantiation_dict": {},
+ "intent": "Check out the oldest closed issues",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/?sort=created_asc&state=closed",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 300
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 46,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/primer/design",
+ "geolocation": null,
+ "intent_template": "Check out the most recent open issues",
+ "instantiation_dict": {},
+ "intent": "Check out the most recent open issues",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/primer/design/-/issues/?sort=created_date&state=opened",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 300
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 47,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.",
+ "instantiation_dict": {
+ "period": "over the past month"
+ },
+ "intent": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past month, and the total amount of money I spent.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does the passage clearly state the number of orders is 0?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does the passage clearly state the total spend is $0?",
+ "answer": "yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "0 order, $0 total spend"
+ },
+ "intent_template_id": 197
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 48,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.",
+ "instantiation_dict": {
+ "period": "over the past three days"
+ },
+ "intent": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past three days, and the total amount of money I spent.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does the passage clearly state the number of orders is 0?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does the passage clearly state the total spend is $0?",
+ "answer": "yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "0 order, $0 total spend"
+ },
+ "intent_template_id": 197
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 49,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.",
+ "instantiation_dict": {
+ "period": "over the past four months"
+ },
+ "intent": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past four months, and the total amount of money I spent.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does the passage clearly state the number of orders is 3?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does the passage clearly state the total spend is $845.49?",
+ "answer": "yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "3 orders, $845.49 total spend"
+ },
+ "intent_template_id": 197
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 50,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.",
+ "instantiation_dict": {
+ "period": "over the past year"
+ },
+ "intent": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past year, and the total amount of money I spent.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does the passage clearly state the number of orders is 24?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does the passage clearly state the total spend is $6560.69?",
+ "answer": "yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "24 orders, $6560.69 total spend"
+ },
+ "intent_template_id": 197
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 51,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Today is 6/12/2023. Tell me how many fulfilled orders I have {{period}}, and the total amount of money I spent.",
+ "instantiation_dict": {
+ "period": "over the past six months"
+ },
+ "intent": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past six months, and the total amount of money I spent.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does the passage clearly state the number of orders is 12?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does the passage clearly state the total spend is $1603.69?",
+ "answer": "yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "12 orders, $1603.69 total spend"
+ },
+ "intent_template_id": 197
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 52,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "How long does it take to walk from {{start}} to {{end}}?",
+ "instantiation_dict": {
+ "start": "Carnegie Mellon University",
+ "end": "starbucks on Craig Street"
+ },
+ "intent": "How long does it take to walk from Carnegie Mellon University to starbucks on Craig Street?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "7 min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "7 min"
+ },
+ "intent_template_id": 68
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 53,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "How long does it take to walk from {{start}} to {{end}}?",
+ "instantiation_dict": {
+ "start": "Univ of Pittsburgh",
+ "end": "starbucks on Craig Street"
+ },
+ "intent": "How long does it take to walk from Univ of Pittsburgh to starbucks on Craig Street?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "18 min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "18 min"
+ },
+ "intent_template_id": 68
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 54,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "How long does it take to walk from {{start}} to {{end}}?",
+ "instantiation_dict": {
+ "start": "Carnegie Mellon University",
+ "end": "Univ of Pittsburgh"
+ },
+ "intent": "How long does it take to walk from Carnegie Mellon University to Univ of Pittsburgh?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "25 min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "25 min"
+ },
+ "intent_template_id": 68
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 55,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "How long does it take to walk from {{start}} to {{end}}?",
+ "instantiation_dict": {
+ "start": "the starbucks near CMU",
+ "end": "Chatham university"
+ },
+ "intent": "How long does it take to walk from the starbucks near CMU to Chatham university?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "30 min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "30 min"
+ },
+ "intent_template_id": 68
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 56,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "How long does it take to walk from {{start}} to {{end}}?",
+ "instantiation_dict": {
+ "start": "Carnegie Museum of Art",
+ "end": "a library at CMU"
+ },
+ "intent": "How long does it take to walk from Carnegie Museum of Art to a library at CMU?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "11 min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "11 min"
+ },
+ "intent_template_id": 68
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 57,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}",
+ "instantiation_dict": {
+ "place1": "restaurant",
+ "place2": "university center at Carnegie Mellon University"
+ },
+ "intent": "Tell me the closest restaurant(s) to university center at Carnegie Mellon University",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "El Gallo de Oro",
+ "Back Bar Grill",
+ "Grano",
+ "Beefsteak",
+ "Nourish",
+ "Schatz Dining Room",
+ "Au Bon Pain"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "El Gallo de Oro, Back Bar Grill, Grano, Beefsteak, Nourish, Schatz Dining Room, Au Bon Pain"
+ },
+ "intent_template_id": 69
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 58,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}",
+ "instantiation_dict": {
+ "place1": "cafe",
+ "place2": "CMU Hunt library"
+ },
+ "intent": "Tell me the closest cafe(s) to CMU Hunt library",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "De Fer Coffee & Tea"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "De Fer Coffee & Tea"
+ },
+ "intent_template_id": 69
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 59,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}",
+ "instantiation_dict": {
+ "place1": "restaurant",
+ "place2": "CMU Hunt library"
+ },
+ "intent": "Tell me the closest restaurant(s) to CMU Hunt library",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "The exchange"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "The exchange"
+ },
+ "intent_template_id": 69
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 60,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}",
+ "instantiation_dict": {
+ "place1": "restaurant",
+ "place2": "CMU Posner Hall"
+ },
+ "intent": "Tell me the closest restaurant(s) to CMU Posner Hall",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "The exchange"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "The exchange"
+ },
+ "intent_template_id": 69
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 61,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Tell me the closest {{place1}}(s) to {{place2}}",
+ "instantiation_dict": {
+ "place1": "restaurant",
+ "place2": "CMU Sorrells Library"
+ },
+ "intent": "Tell me the closest restaurant(s) to CMU Sorrells Library",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "La Prima Espresso"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "La Prima Espresso"
+ },
+ "intent_template_id": 69
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 62,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Which customer has completed the {{quantifier}} number of orders in the entire history?",
+ "instantiation_dict": {
+ "quantifier": "most"
+ },
+ "intent": "Which customer has completed the most number of orders in the entire history?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Jane Smith"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Jane Smith"
+ },
+ "intent_template_id": 276
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 63,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Which customer(s) has completed the {{quantifier}} number of orders in the entire history?",
+ "instantiation_dict": {
+ "quantifier": "second most"
+ },
+ "intent": "Which customer(s) has completed the second most number of orders in the entire history?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Adam Garcia",
+ "Michael Nguyen",
+ "Sarah Miller"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Adam Garcia, Michael Nguyen, Sarah Miller"
+ },
+ "intent_template_id": 276
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 64,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Which customer has placed {{number}} orders in the entire history?",
+ "instantiation_dict": {
+ "number": "2"
+ },
+ "intent": "Which customer has placed 2 orders in the entire history?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Lisa Kim",
+ "Lisa Green",
+ "Julia Williams",
+ "Brian Smith",
+ "Alexander Thomas"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Lisa Kim, Lisa Green, Julia Williams, Brian Smith, Alexander Thomas"
+ },
+ "intent_template_id": 276
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 65,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Which customer has completed the {{quantifier}} number of orders in the entire history?",
+ "instantiation_dict": {
+ "quantifier": "fifth most"
+ },
+ "intent": "Which customer has completed the fifth most number of orders in the entire history?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Jane Doe",
+ "Matt Baker"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Jane Doe and Matt Baker, both placed 12 orders"
+ },
+ "intent_template_id": 276
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 66,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}",
+ "instantiation_dict": {
+ "number": 10,
+ "subreddit": "books",
+ "description": "show me the post URLs that recommend a single book"
+ },
+ "intent": "Among the top 10 post in \"books\" forum, show me the post URLs that recommend a single book",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol",
+ "http://www.reddit.com/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "__REDDIT__/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol, __REDDIT__/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter"
+ },
+ "intent_template_id": 17
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 67,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}",
+ "instantiation_dict": {
+ "number": 10,
+ "subreddit": "books",
+ "description": "show me the book names from posts that recommend a single book"
+ },
+ "intent": "Among the top 10 post in \"books\" forum, show me the book names from posts that recommend a single book",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "A Christmas Carol",
+ "The Hobbit"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "A Christmas Carol, The Hobbit"
+ },
+ "intent_template_id": 17
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 68,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}",
+ "instantiation_dict": {
+ "number": 10,
+ "subreddit": "books",
+ "description": "show me the author name and the book name from posts that recommend a single book"
+ },
+ "intent": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommend a single book",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage mention one of the books is \"A Christmas Carol\"?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage mention one of the books is \"The Hobbit\"?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage mention the author of \"A Christmas Carol\" is \"Levar Burton\"?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage mention the author of \"The Hobbit\" is \"J. R. R. Tolkien\"?",
+ "answer": "yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "A Christmas Carol by Levar Burton: , The Hobbit by J. R. R. Tolkien"
+ },
+ "intent_template_id": 17
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 69,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Among the top {{number}} post in \"{{subreddit}}\" forum, {{description}}",
+ "instantiation_dict": {
+ "number": 10,
+ "subreddit": "books",
+ "description": "is there any post that talks about supporting local book stores? If so, tell me the organizations involved"
+ },
+ "intent": "Among the top 10 post in \"books\" forum, is there any post that talks about supporting local book stores? If so, tell me the organizations involved",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "bookshop.org"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "bookshop.org"
+ },
+ "intent_template_id": 17
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 70,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the zip code of {{place}}?",
+ "instantiation_dict": {
+ "place": "Carnegie Mellon University"
+ },
+ "intent": "What is the zip code of Carnegie Mellon University?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "15213"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "15213"
+ },
+ "intent_template_id": 70
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 71,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the zip code of {{place}}?",
+ "instantiation_dict": {
+ "place": "Chatham University"
+ },
+ "intent": "What is the zip code of Chatham University?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "15232"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "15232"
+ },
+ "intent_template_id": 70
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 72,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the zip code of {{place}}?",
+ "instantiation_dict": {
+ "place": "Yale University"
+ },
+ "intent": "What is the zip code of Yale University?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "06516"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "06516"
+ },
+ "intent_template_id": 70
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 73,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the zip code of {{place}}?",
+ "instantiation_dict": {
+ "place": "Columbia University"
+ },
+ "intent": "What is the zip code of Columbia University?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "10027"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "10027"
+ },
+ "intent_template_id": 70
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 74,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Given the following locations, {{place_list}}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "instantiation_dict": {
+ "place_list": [
+ "Carnegie Mellon University",
+ "apple store shadyside",
+ "starbucks on craig street"
+ ]
+ },
+ "intent": "Given the following locations, ['Carnegie Mellon University', 'apple store shadyside', 'starbucks on craig street'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "The order is Carnegie Mellon University, starbucks on craig street, apple store shadyside"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Carnegie Mellon University, starbucks on craig street, apple store shadyside"
+ },
+ "intent_template_id": 65
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 75,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Given the following locations, {{place_list}}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "instantiation_dict": {
+ "place_list": [
+ "Massachusetts Institute of Technology",
+ "Harvard University",
+ "Boston Logan International Airport"
+ ]
+ },
+ "intent": "Given the following locations, ['Massachusetts Institute of Technology', 'Harvard University', 'Boston Logan International Airport'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "The order is Massachusetts Institute of Technology, Harvard University, Boston Logan International Airport"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Massachusetts Institute of Technology, Harvard University, Boston Logan International Airport"
+ },
+ "intent_template_id": 65
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 76,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Given the following locations, {{place_list}}, what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "instantiation_dict": {
+ "place_list": [
+ "Princeton University",
+ "Yale University",
+ "Harvard University"
+ ]
+ },
+ "intent": "Given the following locations, ['Princeton University', 'Yale University', 'Harvard University'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "The order is Princeton University, Yale University, Harvard University"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Princeton University, Yale University, Harvard University"
+ },
+ "intent_template_id": 65
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 77,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?",
+ "instantiation_dict": {
+ "status": "Pending"
+ },
+ "intent": "What is the total count of Pending reviews amongst all the reviews?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "5"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "5"
+ },
+ "intent_template_id": 277
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 78,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?",
+ "instantiation_dict": {
+ "status": "Approved"
+ },
+ "intent": "What is the total count of Approved reviews amongst all the reviews?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "346"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "346"
+ },
+ "intent_template_id": 277
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 79,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "What is the total count of {{status}} reviews amongst all the reviews?",
+ "instantiation_dict": {
+ "status": "Not Approved"
+ },
+ "intent": "What is the total count of Not Approved reviews amongst all the reviews?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "0"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "0"
+ },
+ "intent_template_id": 277
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 80,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?",
+ "instantiation_dict": {
+ "place_A": "Carnegie Mellon University",
+ "place_B": "Starbucks on Craig Street",
+ "place_C": "Pittsburgh International Airport"
+ },
+ "intent": "What is the duration required to first walk from Carnegie Mellon University to Starbucks on Craig Street, and then drive to Pittsburgh International Airport?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "38 min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "38 min"
+ },
+ "intent_template_id": 72
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 81,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?",
+ "instantiation_dict": {
+ "place_A": "Univ of Pittsburgh",
+ "place_B": "starbucks on Craig Street",
+ "place_C": "Pittsburgh International Airport"
+ },
+ "intent": "What is the duration required to first walk from Univ of Pittsburgh to starbucks on Craig Street, and then drive to Pittsburgh International Airport?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "49 min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "49 min"
+ },
+ "intent_template_id": 72
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 82,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?",
+ "instantiation_dict": {
+ "place_A": "Massachusetts Institute of Technology",
+ "place_B": "Harvard University",
+ "place_C": "Boston Logan International Airport"
+ },
+ "intent": "What is the duration required to first walk from Massachusetts Institute of Technology to Harvard University, and then drive to Boston Logan International Airport?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "63 min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "63 min"
+ },
+ "intent_template_id": 72
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 83,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the duration required to first walk from {{place_A}} to {{place_B}}, and then drive to {{place_C}}?",
+ "instantiation_dict": {
+ "place_A": "Carnegie Mellon University",
+ "place_B": "apple store shadyside",
+ "place_C": "starbucks on craig street"
+ },
+ "intent": "What is the duration required to first walk from Carnegie Mellon University to apple store shadyside, and then drive to starbucks on craig street?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "22 min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "22 min"
+ },
+ "intent_template_id": 72
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 84,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?",
+ "instantiation_dict": {
+ "hotel": "DoubleTree by Hilton New York Downtown",
+ "place": "Keens Steakhouse"
+ },
+ "intent": "From my stay at DoubleTree by Hilton New York Downtown, what's the estimated driving time to reach Keens Steakhouse?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "14 minutes"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "14 minutes"
+ },
+ "intent_template_id": 64
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 85,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?",
+ "instantiation_dict": {
+ "hotel": "La Quinta Inn near the airport",
+ "place": "Carnegie Mellon University"
+ },
+ "intent": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Carnegie Mellon University?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "30 minutes"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "30 minutes"
+ },
+ "intent_template_id": 64
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 86,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?",
+ "instantiation_dict": {
+ "hotel": "La Quinta Inn near the airport",
+ "place": "Upitt"
+ },
+ "intent": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Upitt?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "29 minutes"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "29 minutes"
+ },
+ "intent_template_id": 64
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 87,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?",
+ "instantiation_dict": {
+ "hotel": "red roof inn",
+ "place": "Pittsburgh science museum"
+ },
+ "intent": "From my stay at red roof inn, what's the estimated driving time to reach Pittsburgh science museum?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "20 minutes"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "20 minutes"
+ },
+ "intent_template_id": 64
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 88,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "From my stay at {{hotel}}, what's the estimated driving time to reach {{place}}?",
+ "instantiation_dict": {
+ "hotel": "Homewood Suites Southpointe",
+ "place": "PPG Paints Arena"
+ },
+ "intent": "From my stay at Homewood Suites Southpointe, what's the estimated driving time to reach PPG Paints Arena?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "34 minutes"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "34 minutes"
+ },
+ "intent_template_id": 64
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 89,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Which US states border {{state}}?",
+ "instantiation_dict": {
+ "state": "Connecticut"
+ },
+ "intent": "Which US states border Connecticut?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Rhode Island",
+ "Massachusetts",
+ "New York"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Rhode Island, Massachusetts, New York"
+ },
+ "intent_template_id": 67
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 90,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Which US states border {{state}}?",
+ "instantiation_dict": {
+ "state": "Pennsylvania"
+ },
+ "intent": "Which US states border Pennsylvania?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Ohio",
+ "Maryland",
+ "New York",
+ "New Jersey",
+ "Delaware",
+ "West Virginia"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Ohio, Maryland, New York, New Jersey, Delaware, West Virginia"
+ },
+ "intent_template_id": 67
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 91,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Which US states border {{state}}?",
+ "instantiation_dict": {
+ "state": "Massachusetts"
+ },
+ "intent": "Which US states border Massachusetts?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Rhode Island",
+ "Connecticut",
+ "New York",
+ "New Hampshire",
+ "Vermont"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Rhode Island, Connecticut, New York, New Hampshire, Vermont"
+ },
+ "intent_template_id": 67
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 92,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Which US states border {{state}}?",
+ "instantiation_dict": {
+ "state": "Vermont"
+ },
+ "intent": "Which US states border Vermont?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "New York",
+ "New Hampshire",
+ "Massachusetts"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "New York, New Hampshire, Massachusetts"
+ },
+ "intent_template_id": 67
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 93,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Which US states border {{state}}?",
+ "instantiation_dict": {
+ "state": "New Hampshire"
+ },
+ "intent": "Which US states border New Hampshire?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Massachusetts",
+ "Vermont",
+ "Maine"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Massachusetts, Vermont, Maine"
+ },
+ "intent_template_id": 67
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 94,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Tell me the grand total of invoice {{id}}.",
+ "instantiation_dict": {
+ "id": "000000001"
+ },
+ "intent": "Tell me the grand total of invoice 000000001.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "36.39"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$36.39"
+ },
+ "intent_template_id": 274
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 95,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Tell me the grand total of invoice {{id}}.",
+ "instantiation_dict": {
+ "id": "000000002"
+ },
+ "intent": "Tell me the grand total of invoice 000000002.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "39.64"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$39.64"
+ },
+ "intent_template_id": 274
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 96,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Tell me the status of my latest order and when will it arrive",
+ "instantiation_dict": {},
+ "intent": "Tell me the status of my latest order and when will it arrive",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "The last order was canceled. It will never arrive."
+ },
+ "reference_url": "",
+ "program_html": [],
+ "reference_answer_raw_annotation": "The last order was canceled. It will never arrive.",
+ "string_note": ""
+ },
+ "intent_template_id": 193
+ },
+ {
+ "sites": [
+ "map",
+ "wikipedia"
+ ],
+ "task_id": 97,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts",
+ "instantiation_dict": {},
+ "intent": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "914km"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "914 km"
+ },
+ "intent_template_id": 120
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 98,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?",
+ "instantiation_dict": {
+ "places": "tea cafe",
+ "start": "University of Pittsburgh"
+ },
+ "intent": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the nearest tea cafe is Fuku Tea?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the address of Fuku Tea is 3716 Forbes Avenue?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the walking distance to Fuku Tea is 653m?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Fuku Tea, 3716, Forbes Avenue, Oakland, Central Oakland, Pittsburgh, Allegheny County, Pennsylvania, 15213, United States\n653m"
+ },
+ "intent_template_id": 66
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 99,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?",
+ "instantiation_dict": {
+ "places": "Five Guys",
+ "start": "5700 Penn Ave"
+ },
+ "intent": "Where is the nearest Five Guys to 5700 Penn Ave, and what is the walking distance to it?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the nearest Five Guys is at 117 South Bouquet Street?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the walking distance to Five Guys is 4.0km?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Five Guys, 117, South Bouquet Street, Oakland, North Oakland, Pittsburgh, Allegheny County, Pennsylvania, 15213, United States\n4.0km"
+ },
+ "intent_template_id": 66
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 100,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?",
+ "instantiation_dict": {
+ "places": "Starbucks",
+ "start": "Carnegie Mellon"
+ },
+ "intent": "Where is the nearest Starbucks to Carnegie Mellon, and what is the walking distance to it?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the nearest Starbucks is at 417 South Craig Street?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the walking distance to Starbucks is 557m?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Starbucks, 417, South Craig Street, Bellefield, Pittsburgh, Allegheny County, Pennsylvania, 15213, United States\n557m"
+ },
+ "intent_template_id": 66
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 101,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Where is the nearest {{places}} to {{start}}, and what is the walking distance to it?",
+ "instantiation_dict": {
+ "places": "In-N-Out",
+ "start": "Upitts"
+ },
+ "intent": "Where is the nearest In-N-Out to Upitts, and what is the walking distance to it?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "There is no In-N-Out near University of Pittsburgh"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 66
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 102,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}",
+ "instantiation_dict": {
+ "label": "help needed",
+ "repo": "a11yproject/a11yproject.com"
+ },
+ "intent": "Display the list of issues in the a11yproject/a11yproject.com repository that have labels related to help needed",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/?label_name%5B%5D=help%20wanted",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 349
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 103,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}",
+ "instantiation_dict": {
+ "label": "questions",
+ "repo": "kkroening/ffmpeg-python"
+ },
+ "intent": "Display the list of issues in the kkroening/ffmpeg-python repository that have labels related to questions",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/kkroening/ffmpeg-python/-/issues/?label_name%5B%5D=question",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 349
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 104,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}",
+ "instantiation_dict": {
+ "label": "flaky-test",
+ "repo": "keycloak/keycloak"
+ },
+ "intent": "Display the list of issues in the keycloak/keycloak repository that have labels related to flaky-test",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/keycloak/keycloak/-/issues/?label_name%5B%5D=flaky-test",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 349
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 105,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}",
+ "instantiation_dict": {
+ "label": "OpenAPI Generator CLI",
+ "repo": "OpenAPITools/openapi-generator"
+ },
+ "intent": "Display the list of issues in the OpenAPITools/openapi-generator repository that have labels related to OpenAPI Generator CLI",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/OpenAPITools/openapi-generator/-/issues/?label_name%5B%5D=OpenAPI%20Generator%20CLI",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 349
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 106,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Display the list of issues in the {{repo}} repository that have labels related to {{label}}",
+ "instantiation_dict": {
+ "label": "BUG",
+ "repo": "umano/AndroidSlidingUpPanel"
+ },
+ "intent": "Display the list of issues in the umano/AndroidSlidingUpPanel repository that have labels related to BUG",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/umano/AndroidSlidingUpPanel/-/issues/?label_name%5B%5D=BUG",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 349
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 107,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Presents the monthly count of successful orders {{period}} in MONTH:COUNT format",
+ "instantiation_dict": {
+ "period": "from May to December 2022"
+ },
+ "intent": "Presents the monthly count of successful orders from May to December 2022 in MONTH:COUNT format",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "September: 10 orders",
+ "October: 4 orders",
+ "November: 5 orders",
+ "December: 10 orders"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "May: 8 orders June: 13 orders July: 9 orders August: 8 orders September: 10 orders October: 4 orders November: 5 orders December: 10 orders "
+ },
+ "intent_template_id": 270
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 108,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Presents the monthly count of successful orders {{period}} in MONTH:COUNT format",
+ "instantiation_dict": {
+ "period": "01/2023-05/2023"
+ },
+ "intent": "Presents the monthly count of successful orders 01/2023-05/2023 in MONTH:COUNT format",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "January: 12 orders",
+ "February: 7 orders",
+ "March: 5 orders",
+ "April: 9 orders",
+ "May: 5 orders"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "January: 12 orders Febulary: 7 orders March: 5 orders Apirl: 9 orders May: 5 orders"
+ },
+ "intent_template_id": 270
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 109,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Presents the monthly count of successful orders {{period}} in MONTH:COUNT format",
+ "instantiation_dict": {
+ "period": "from Jan to December 2022"
+ },
+ "intent": "Presents the monthly count of successful orders from Jan to December 2022 in MONTH:COUNT format",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "January: 11 orders",
+ "February: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "September: 10 orders",
+ "October: 4 orders",
+ "November: 5 orders",
+ "December: 10 orders"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "January: 11 orders Feburary: 16 orders March: 14 orders April: 7 orders May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders December: 10 orders "
+ },
+ "intent_template_id": 270
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 110,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Presents the monthly count of successful orders {{period}} in MONTH:COUNT format",
+ "instantiation_dict": {
+ "period": "from Jan to Nov 2022"
+ },
+ "intent": "Presents the monthly count of successful orders from Jan to Nov 2022 in MONTH:COUNT format",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "January: 11 orders",
+ "February: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "September: 10 orders",
+ "October: 4 orders",
+ "November: 5 orders"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "January: 11 orders Feburary: 16 orders March: 14 orders April: 7 orders May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders "
+ },
+ "intent_template_id": 270
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 111,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Presents the monthly count of successful orders {{period}} in MONTH:COUNT format",
+ "instantiation_dict": {
+ "period": "from Feb to Nov 2022"
+ },
+ "intent": "Presents the monthly count of successful orders from Feb to Nov 2022 in MONTH:COUNT format",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "February: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "September: 10 orders",
+ "October: 4 orders",
+ "November: 5 orders"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Feburary: 16 orders March: 14 orders April: 7 orders May: 8 orders June: 13 orders July: 9 orders August: 8 orders Sepetember: 10 orders Octorbor: 4 orders November: 5 orders "
+ },
+ "intent_template_id": 270
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 112,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Show me the customers who have expressed dissatisfaction with {{product}}?",
+ "instantiation_dict": {
+ "product": "Circe fleece"
+ },
+ "intent": "Show me the customers who have expressed dissatisfaction with Circe fleece?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Hannah Lim"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Hannah Lim"
+ },
+ "intent_template_id": 245
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 113,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Show me the customers who have expressed dissatisfaction with {{product}}?",
+ "instantiation_dict": {
+ "product": "Olivia zip jacket"
+ },
+ "intent": "Show me the customers who have expressed dissatisfaction with Olivia zip jacket?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Emma Lopez",
+ "Seam Miller"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Emma Lopez, Seam Miller"
+ },
+ "intent_template_id": 245
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 114,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Show me the customers who have expressed dissatisfaction with {{product}}?",
+ "instantiation_dict": {
+ "product": "Antonia racer tank"
+ },
+ "intent": "Show me the customers who have expressed dissatisfaction with Antonia racer tank?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Shaunte",
+ "Merrie"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Shaunte, Merrie"
+ },
+ "intent_template_id": 245
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 115,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Show me the name of the customers who have expressed dissatisfaction with {{product}}",
+ "instantiation_dict": {
+ "product": "Chloe tank"
+ },
+ "intent": "Show me the name of the customers who have expressed dissatisfaction with Chloe tank",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "There is no negative review for Chloe tank"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": ""
+ },
+ "intent_template_id": 245
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 116,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Show me the name of the customers who have expressed dissatisfaction with {{product}}?",
+ "instantiation_dict": {
+ "product": "tanks products"
+ },
+ "intent": "Show me the name of the customers who have expressed dissatisfaction with tanks products?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Alexander",
+ "Carma",
+ "Dominic",
+ "Merrie",
+ "Monroe",
+ "Scotty",
+ "Shaunte",
+ "Teofila",
+ "Valorie"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Alexander, Carma, Dominic, Merrie, Monroe, Scotty, Shaunte, Teofila, Valorie"
+ },
+ "intent_template_id": 245
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 117,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "What is the date when I made my first purchase on this site?",
+ "instantiation_dict": {},
+ "intent": "What is the date when I made my first purchase on this site?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "3/2/22"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "3/2/22"
+ },
+ "intent_template_id": 161
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 118,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I have jaw bruxism problem, show me something that could alleviate the problem.",
+ "instantiation_dict": {},
+ "intent": "I have jaw bruxism problem, show me something that could alleviate the problem.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": null,
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "jaw bruxism",
+ "mouth guard"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 151
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 119,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Tell me the reasons why customers like {{product}}",
+ "instantiation_dict": {
+ "product": "Antonia Racer Tank"
+ },
+ "intent": "Tell me the reasons why customers like Antonia Racer Tank",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Its color and style is good"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Its color and style is good"
+ },
+ "intent_template_id": 250
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 120,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Tell me the reasons why customers like {{product}}",
+ "instantiation_dict": {
+ "product": "Ana Running Short"
+ },
+ "intent": "Tell me the reasons why customers like Ana Running Short",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "It is comfortable"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "It is comfortable"
+ },
+ "intent_template_id": 250
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 121,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Tell me the reasons why customers like {{product}}",
+ "instantiation_dict": {
+ "product": "Circe hooded fleece"
+ },
+ "intent": "Tell me the reasons why customers like Circe hooded fleece",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Warm and comfortable. True to size."
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Warm and comfortable. True to size."
+ },
+ "intent_template_id": 250
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 122,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Tell me the reasons why customers like {{product}}",
+ "instantiation_dict": {
+ "product": "Olivia zip jacket"
+ },
+ "intent": "Tell me the reasons why customers like Olivia zip jacket",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Lightweight, comfortable and stylish. Good design and details."
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Lightweight, comfortable, and stylish. Good design and details."
+ },
+ "intent_template_id": 250
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 123,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Tell me the reasons why customers like {{product}}",
+ "instantiation_dict": {
+ "product": "Circe's products"
+ },
+ "intent": "Tell me the reasons why customers like Circe's products",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Warm and comfortable. True to size."
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Warm and comfortable. True to size."
+ },
+ "intent_template_id": 250
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 124,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "What is the price range of {{product}} in the One Stop Market?",
+ "instantiation_dict": {
+ "product": "wireless earphone"
+ },
+ "intent": "What is the price range of wireless earphone in the One Stop Market?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the lowest price of wireless earphone is $0.14?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the highest price of wireless earphone is $745.00?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$0.14 - $745.00"
+ },
+ "intent_template_id": 159
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 125,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "What is the price range of {{product}} in the One Stop Market?",
+ "instantiation_dict": {
+ "product": "teeth grinding mouth guard"
+ },
+ "intent": "What is the price range of teeth grinding mouth guard in the One Stop Market?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the lowest price of teeth grinding mouth guard is $1.46?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the highest price of teeth grinding mouth guard is $85.00?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$1.46 - $85"
+ },
+ "intent_template_id": 159
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 126,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "What is the price range of {{product}} in the One Stop Market?",
+ "instantiation_dict": {
+ "product": "Canon photo printer"
+ },
+ "intent": "What is the price range of Canon photo printer in the One Stop Market?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the lowest price of Canon photo printer is $2.56?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the highest price of Canon photo printer is $649.99?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$2.56 - $649.99"
+ },
+ "intent_template_id": 159
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 127,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Which brands appear most often in the top-3 search terms?",
+ "instantiation_dict": {},
+ "intent": "Which brands appear most often in the top-3 search terms?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Hollister",
+ "Joust",
+ "nike"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Hollister, Joust, nike"
+ },
+ "intent_template_id": 1001
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 128,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "What's the total number of items sold in the most recent {{k}} orders?",
+ "instantiation_dict": {
+ "k": "2"
+ },
+ "intent": "What's the total number of items sold in the most recent 2 orders?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "9"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "9"
+ },
+ "intent_template_id": 1002
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 129,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "What's the total number of items sold in the most recent {{k}} orders?",
+ "instantiation_dict": {
+ "k": "4"
+ },
+ "intent": "What's the total number of items sold in the most recent 4 orders?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "16"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "16"
+ },
+ "intent_template_id": 1002
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 130,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "What's the total number of items sold in the most recent {{k}} orders?",
+ "instantiation_dict": {
+ "k": "5"
+ },
+ "intent": "What's the total number of items sold in the most recent 5 orders?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "18"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "18"
+ },
+ "intent_template_id": 1002
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 131,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "What's the total number of items sold in the most recent {{k}} orders?",
+ "instantiation_dict": {
+ "k": "7"
+ },
+ "intent": "What's the total number of items sold in the most recent 7 orders?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "25"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "25"
+ },
+ "intent_template_id": 1002
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 132,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?",
+ "instantiation_dict": {
+ "user": "kilian",
+ "repo": "a11yproject",
+ "date": "3/5/2023"
+ },
+ "intent": "How many commits did kilian make to a11yproject on 3/5/2023?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "1"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "1"
+ },
+ "intent_template_id": 322
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 133,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?",
+ "instantiation_dict": {
+ "user": "Eric",
+ "repo": "a11yproject",
+ "date": "3/2"
+ },
+ "intent": "How many commits did Eric make to a11yproject on 3/2?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "2"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "2"
+ },
+ "intent_template_id": 322
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 134,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?",
+ "instantiation_dict": {
+ "user": "kilian",
+ "repo": "a11yproject",
+ "date": "3/1/2023"
+ },
+ "intent": "How many commits did kilian make to a11yproject on 3/1/2023?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "0"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "0"
+ },
+ "intent_template_id": 322
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 135,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?",
+ "instantiation_dict": {
+ "user": "Eric and Kilian",
+ "repo": "a11yproject",
+ "date": "1/3/2023"
+ },
+ "intent": "How many commits did Eric and Kilian make to a11yproject on 1/3/2023?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "1"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "1"
+ },
+ "intent_template_id": 322
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 136,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "How many commits did {{user}} make to {{repo}} on {{date}}?",
+ "instantiation_dict": {
+ "user": "Steven Woodson",
+ "repo": "a11y-webring.club",
+ "date": "2/6/2023"
+ },
+ "intent": "How many commits did Steven Woodson make to a11y-webring.club on 2/6/2023?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "5"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "5"
+ },
+ "intent_template_id": 322
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 137,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?",
+ "instantiation_dict": {
+ "city1": "the city where the Liberty Bell is located",
+ "city2": "the home city of Pirates"
+ },
+ "intent": "What is the estimated driving time between the city where the Liberty Bell is located and the home city of Pirates?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "5h 47min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "5h 47min"
+ },
+ "intent_template_id": 51
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 138,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?",
+ "instantiation_dict": {
+ "city1": "the big apple",
+ "city2": "the city with the most authentic Philly cheesesteaks"
+ },
+ "intent": "What is the estimated driving time between the big apple and the city with the most authentic Philly cheesesteaks?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "1h 58min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "1h 58min"
+ },
+ "intent_template_id": 51
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 139,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?",
+ "instantiation_dict": {
+ "city1": "the hometown of Joe Biden",
+ "city2": "Bridgeport"
+ },
+ "intent": "What is the estimated driving time between the hometown of Joe Biden and Bridgeport?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "3h 20min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "3h 20min"
+ },
+ "intent_template_id": 51
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 140,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the estimated driving time between {{city1}} and {{city2}}?",
+ "instantiation_dict": {
+ "city1": "the city of Niagara Falls",
+ "city2": "the city of Yale University"
+ },
+ "intent": "What is the estimated driving time between the city of Niagara Falls and the city of Yale University?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "8h 33min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "8h 33min"
+ },
+ "intent_template_id": 51
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 141,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "How much I spent on {{category}} shopping during {{time}}",
+ "instantiation_dict": {
+ "category": "food-related",
+ "time": "March 2023"
+ },
+ "intent": "How much I spent on food-related shopping during March 2023",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "47.41"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$47.41"
+ },
+ "intent_template_id": 162
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 142,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "How much I spent on {{category}} shopping during {{time}}",
+ "instantiation_dict": {
+ "category": "hair care and hair style",
+ "time": "Jan 2023"
+ },
+ "intent": "How much I spent on hair care and hair style shopping during Jan 2023",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "95.23"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$95.23"
+ },
+ "intent_template_id": 162
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 143,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "How much I spent on {{category}} shopping during {{time}}",
+ "instantiation_dict": {
+ "category": "home decoration",
+ "time": "1/29/2023"
+ },
+ "intent": "How much I spent on home decoration shopping during 1/29/2023",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "265.69"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$265.69"
+ },
+ "intent_template_id": 162
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 144,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "How much I spent on {{category}} shopping during {{time}}",
+ "instantiation_dict": {
+ "category": "food",
+ "time": "from mid Jan to the end Jan 2023"
+ },
+ "intent": "How much I spent on food shopping during from mid Jan to the end Jan 2023",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "0"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "0"
+ },
+ "intent_template_id": 162
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 145,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "How much I spent on {{category}} shopping during {{time}}",
+ "instantiation_dict": {
+ "category": "cooking and food",
+ "time": "March 2022"
+ },
+ "intent": "How much I spent on cooking and food shopping during March 2022",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "$52.35"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$52.35"
+ },
+ "intent_template_id": 162
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 146,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}",
+ "instantiation_dict": {
+ "option": "size",
+ "product": "picture frame",
+ "time": "Sep 2022"
+ },
+ "intent": "What is the size configuration of the picture frame I bought Sep 2022",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "16x24"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "16x24"
+ },
+ "intent_template_id": 155
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 147,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}",
+ "instantiation_dict": {
+ "option": "size",
+ "product": "picture frame",
+ "time": "2022"
+ },
+ "intent": "What is the size configuration of the picture frame I bought 2022",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "16x24"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "16x24"
+ },
+ "intent_template_id": 155
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 148,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}",
+ "instantiation_dict": {
+ "option": "color",
+ "product": "picture frame",
+ "time": "Sep 2022"
+ },
+ "intent": "What is the color configuration of the picture frame I bought Sep 2022",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Mist"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Mist"
+ },
+ "intent_template_id": 155
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 149,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}",
+ "instantiation_dict": {
+ "option": "color",
+ "product": "artificial plants",
+ "time": "Feb 2023"
+ },
+ "intent": "What is the color configuration of the artificial plants I bought Feb 2023",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Green-vines"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Green-vines"
+ },
+ "intent_template_id": 155
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 150,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "What is the {{option}} configuration of the {{product}} I bought {{time}}",
+ "instantiation_dict": {
+ "option": "price",
+ "product": "fake tree",
+ "time": "Jan 2023"
+ },
+ "intent": "What is the price configuration of the fake tree I bought Jan 2023",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "$260.69"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$260.69"
+ },
+ "intent_template_id": 155
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 151,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?",
+ "instantiation_dict": {
+ "location1": "CMU",
+ "location2": "University of Pittsburgh"
+ },
+ "intent": "What is the minimum travel time by car from CMU to University of Pittsburgh?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "4min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "4min"
+ },
+ "intent_template_id": 36
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 152,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?",
+ "instantiation_dict": {
+ "location1": "Schenley park",
+ "location2": "Upitt"
+ },
+ "intent": "What is the minimum travel time by car from Schenley park to Upitt?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "4min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "4min"
+ },
+ "intent_template_id": 36
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 153,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?",
+ "instantiation_dict": {
+ "location1": "REI",
+ "location2": "CMU"
+ },
+ "intent": "What is the minimum travel time by car from REI to CMU?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "7min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "7min"
+ },
+ "intent_template_id": 36
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 154,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?",
+ "instantiation_dict": {
+ "location1": "CMU gates building",
+ "location2": "Schenley park"
+ },
+ "intent": "What is the minimum travel time by car from CMU gates building to Schenley park?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "4min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "4min"
+ },
+ "intent_template_id": 36
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 155,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the minimum travel time by car from {{location1}} to {{location2}}?",
+ "instantiation_dict": {
+ "location1": "Animal Rescue League of Pittsburgh",
+ "location2": "Schenley park"
+ },
+ "intent": "What is the minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley park?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "9min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "9min"
+ },
+ "intent_template_id": 36
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 156,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Check out merge requests assigned to me",
+ "instantiation_dict": {},
+ "intent": "Check out merge requests assigned to me",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/dashboard/merge_requests?assignee_username=byteblaze",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 290
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 157,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Show all customers",
+ "instantiation_dict": {},
+ "intent": "Show all customers",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/customer/index/",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 255
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 158,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards",
+ "instantiation_dict": {
+ "num": 11
+ },
+ "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 11 cards",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 171
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 159,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards",
+ "instantiation_dict": {
+ "num": 31
+ },
+ "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 31 cards",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 171
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 160,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards",
+ "instantiation_dict": {
+ "num": 6
+ },
+ "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 6 cards",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 171
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 161,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards",
+ "instantiation_dict": {
+ "num": 23
+ },
+ "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 23 cards",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/heiying-game-card-case-for-nintendo-switch-switch-oled-game-card-or-micro-sd-memory-cards-portable-switch-game-memory-card-storage-with-24-game-card-slots-and-24-micro-sd-card-slots-black.html",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 171
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 162,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all {{num}} cards",
+ "instantiation_dict": {
+ "num": 40
+ },
+ "intent": "I have a lot of Nintendo Switch game cards now, help me find the best storage option to fit all 40 cards",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 171
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 163,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/ostent-16gb-memory-card-stick-storage-for-sony-ps-vita-psv1000-2000-pch-z081-z161-z321-z641.html",
+ "geolocation": null,
+ "intent_template": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "instantiation_dict": {},
+ "intent": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "I ordered the 16gb but I only got 14 gigs even though I formatted the card",
+ "The memory card is kind of slow on games and downloads",
+ "No original packaging It's used and the previous owners data has not been erased",
+ "The product is a legit sony hardware that have been owned by someone else before",
+ "The media could not be loaded",
+ "I could not format the card so I wasn\u2019t able to use it for my VITA"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "I ordered the 16gb but I only got 14 gigs even though I formatted the card. The memory card is kind of slow on games and downloads. No original packaging It's used and the previous owners data has not been erased. The product is a legit sony hardware that have been owned by someone else before. The media could not be loaded. I could not format the card so I wasn\u2019t able to use it for my VITA"
+ },
+ "intent_template_id": 136
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 164,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/mineralogie-all-natural-lip-gloss-ruby-rose.html",
+ "geolocation": null,
+ "intent_template": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "instantiation_dict": {},
+ "intent": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Dry",
+ "Uneven color"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Dry. Uneven color"
+ },
+ "intent_template_id": 136
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 165,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/sandgrens-swedish-handmade-wooden-clog-sandal-copenhagen.html",
+ "geolocation": null,
+ "intent_template": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "instantiation_dict": {},
+ "intent": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "The 39 was too small. I am afraid the 40 will be too big",
+ "I was very sad when the shoe rubbed up against my baby toe",
+ "I had to return them because I knew in time it would tear up my feet",
+ "The problem is that the strap is made of some really stiff leather and is painful to my heel",
+ "The front is also uncomfortably tight",
+ "The Dansko's were similar (not as bad) and loosened up over time"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "The 39 was too small. I am afraid the 40 will be too big. I was very sad when the shoe rubbed up against my baby toe. I had to return them because I knew in time it would tear up my feet. The problem is that the strap is made of some really stiff leather and is painful to my heel. The front is also uncomfortably tight. The Dansko's were similar (not as bad) and loosened up over time."
+ },
+ "intent_template_id": 136
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 166,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/sensodyne-repair-protect-whitening-toothpaste-with-fluoride-3-4-oz-pack-of-3.html",
+ "geolocation": null,
+ "intent_template": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "instantiation_dict": {},
+ "intent": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "there is no existing criticism for the given product. All reviews are positive."
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 136
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 167,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/photosmart-plus-b209-clr-inkjetfb-p-s-c-usb-wrls-1.html",
+ "geolocation": null,
+ "intent_template": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "instantiation_dict": {},
+ "intent": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "The wireless connection works on a whim (about 40% of the time I've owned it)",
+ "It seems to constantly run out of ink",
+ "Cartridge prices are less than some printers I've had",
+ "This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason",
+ "Scanner is as slow as my first scanner I ever owned in the mid-90's",
+ "For the $176 I paid, there isn't even a fax component on it. I guess the \"PLUS\" part of it's name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months."
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "The wireless connection works on a whim (about 40% of the time I've owned it). It seems to constantly run out of ink. Cartridge prices are less than some printers I've had, but now I understand why. This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason. Scanner is as slow as my first scanner I ever owned in the mid-90's. For the $176 I paid, there isn't even a fax component on it. I guess the \"PLUS\" part of it's name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months."
+ },
+ "intent_template_id": 136
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 168,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?",
+ "instantiation_dict": {
+ "description": "more than 100"
+ },
+ "intent": "Tell me the full names of the repositories where I made contributions and they got more than 100 stars?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "Among the repositories where I made contributions, there is no repository that has more than 100 stars."
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "No repo found"
+ },
+ "intent_template_id": 289
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 169,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?",
+ "instantiation_dict": {
+ "description": "the most"
+ },
+ "intent": "Tell me the full names of the repositories where I made contributions and they got the most stars?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "a11yproject.com",
+ "design"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "a11yproject.com, Primer/design"
+ },
+ "intent_template_id": 289
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 170,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?",
+ "instantiation_dict": {
+ "description": "the least"
+ },
+ "intent": "Tell me the full names of the repositories where I made contributions and they got the least stars?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, remove-board-movement-events-from-the-github-issue-timeline"
+ },
+ "intent_template_id": 289
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 171,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?",
+ "instantiation_dict": {
+ "description": "less than 5"
+ },
+ "intent": "Tell me the full names of the repositories where I made contributions and they got less than 5 stars?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "a11y-syntax-highlighting",
+ "a11y-webring.club",
+ "accessible-html-content-patterns",
+ "ericwbailey.website",
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "a11y-syntax-highlighting, a11y-webring.club, accessible-html-content-patterns, ericwbailey.website, cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, remove-board-movement-events-from-the-github-issue-timeline"
+ },
+ "intent_template_id": 289
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 172,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Tell me the full names of the repositories where I made contributions and they got {{description}} stars?",
+ "instantiation_dict": {
+ "description": "no"
+ },
+ "intent": "Tell me the full names of the repositories where I made contributions and they got no stars?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, remove-board-movement-events-from-the-github-issue-timeline"
+ },
+ "intent_template_id": 289
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 173,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed",
+ "instantiation_dict": {
+ "keyword": "better"
+ },
+ "intent": "Open my latest updated issue that has keyword \"better\" in its title to check if it is closed",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match",
+ "url_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "No, it is open"
+ },
+ "reference_url": "__GITLAB__/byteblaze/empathy-prompts/-/issues/8",
+ "program_html": [],
+ "reference_answer_raw_annotation": "Not closed",
+ "string_note": "",
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 310
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 174,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed",
+ "instantiation_dict": {
+ "keyword": "feature"
+ },
+ "intent": "Open my latest updated issue that has keyword \"feature\" in its title to check if it is closed",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match",
+ "url_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "No, it is open"
+ },
+ "reference_url": "__GITLAB__/byteblaze/a11y-webring.club/-/issues/71",
+ "program_html": [],
+ "reference_answer_raw_annotation": "Not closed",
+ "string_note": ""
+ },
+ "intent_template_id": 310
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 175,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed",
+ "instantiation_dict": {
+ "keyword": "dependency"
+ },
+ "intent": "Open my latest updated issue that has keyword \"dependency\" in its title to check if it is closed",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match",
+ "url_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "No, it is open"
+ },
+ "reference_url": "__GITLAB__/byteblaze/empathy-prompts/-/issues/18",
+ "program_html": [],
+ "reference_answer_raw_annotation": "Not closed",
+ "string_note": ""
+ },
+ "intent_template_id": 310
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 176,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed",
+ "instantiation_dict": {
+ "keyword": "theme editor"
+ },
+ "intent": "Open my latest updated issue that has keyword \"theme editor\" in its title to check if it is closed",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match",
+ "url_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "No, it is open"
+ },
+ "reference_url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/1",
+ "program_html": [],
+ "reference_answer_raw_annotation": "Not closed",
+ "string_note": ""
+ },
+ "intent_template_id": 310
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 177,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Open my latest updated issue that has keyword \"{{keyword}}\" in its title to check if it is closed",
+ "instantiation_dict": {
+ "keyword": "homepage content"
+ },
+ "intent": "Open my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match",
+ "url_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Yes, it is closed"
+ },
+ "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/719",
+ "program_html": [],
+ "reference_answer_raw_annotation": "closed",
+ "string_note": ""
+ },
+ "intent_template_id": 310
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 178,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed",
+ "instantiation_dict": {
+ "keyword": "better"
+ },
+ "intent": "Open my latest created issue that has better in its title to check if it is closed",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match",
+ "url_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Yes"
+ },
+ "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/566",
+ "program_html": [],
+ "reference_answer_raw_annotation": "Closed",
+ "string_note": ""
+ },
+ "intent_template_id": 500
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 179,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed",
+ "instantiation_dict": {
+ "keyword": "feature"
+ },
+ "intent": "Open my latest created issue that has feature in its title to check if it is closed",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match",
+ "url_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Yes"
+ },
+ "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/1517",
+ "program_html": [],
+ "reference_answer_raw_annotation": "Closed",
+ "string_note": ""
+ },
+ "intent_template_id": 500
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 180,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed",
+ "instantiation_dict": {
+ "keyword": "dependency"
+ },
+ "intent": "Open my latest created issue that has dependency in its title to check if it is closed",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match",
+ "url_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "No"
+ },
+ "reference_url": "__GITLAB__/byteblaze/empathy-prompts/-/issues/18",
+ "program_html": [],
+ "reference_answer_raw_annotation": "Not closed",
+ "string_note": ""
+ },
+ "intent_template_id": 500
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 181,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed",
+ "instantiation_dict": {
+ "keyword": "theme editor"
+ },
+ "intent": "Open my latest created issue that has theme editor in its title to check if it is closed",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match",
+ "url_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "No"
+ },
+ "reference_url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues/1",
+ "program_html": [],
+ "reference_answer_raw_annotation": "Not closed",
+ "string_note": ""
+ },
+ "intent_template_id": 500
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 182,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Open my latest created issue that has {{keyword}} in its title to check if it is closed",
+ "instantiation_dict": {
+ "keyword": "homepage content"
+ },
+ "intent": "Open my latest created issue that has homepage content in its title to check if it is closed",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match",
+ "url_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Yes"
+ },
+ "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/719",
+ "program_html": [],
+ "reference_answer_raw_annotation": "closed",
+ "string_note": ""
+ },
+ "intent_template_id": 500
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 183,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left",
+ "instantiation_dict": {
+ "Attribute": "SKU",
+ "N": "10"
+ },
+ "intent": "Give me the SKU of the products that have 10 units left",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "There is no product that has 10 quantities left."
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "There is no product that has 10 quantities left."
+ },
+ "intent_template_id": 368
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 184,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left",
+ "instantiation_dict": {
+ "Attribute": "name",
+ "N": "0"
+ },
+ "intent": "Give me the name of the products that have 0 units left",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Sinbad Fitness Tank"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Sinbad Fitness Tank"
+ },
+ "intent_template_id": 368
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 185,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left",
+ "instantiation_dict": {
+ "Attribute": "brand",
+ "N": "3"
+ },
+ "intent": "Give me the brand of the products that have 3 units left",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Eos",
+ "Minerva"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Eos, Minerva"
+ },
+ "intent_template_id": 368
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 186,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left",
+ "instantiation_dict": {
+ "Attribute": "product names and the sizes",
+ "N": "2-3"
+ },
+ "intent": "Give me the product names and the sizes of the products that have 2-3 units left",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Eos V-Neck Hoodie: S",
+ "Minerva Luma Tech V-Tee: XS"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Eos V-Neck Hoodie: S Minera Luma Tech V-Tee: XS"
+ },
+ "intent_template_id": 368
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 187,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Give me the {{Attribute}} of the products that have {{N}} units left",
+ "instantiation_dict": {
+ "Attribute": "SKU",
+ "N": "1-3"
+ },
+ "intent": "Give me the SKU of the products that have 1-3 units left",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "WH11-S-Blue",
+ "WS08-XS-Blue"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "WH11-S-Blue, WS08-XS-Blue"
+ },
+ "intent_template_id": 368
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 188,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Tell me the total cost of my latest {{status}} order?",
+ "instantiation_dict": {
+ "status": "cancelled"
+ },
+ "intent": "Tell me the total cost of my latest cancelled order?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "$365.42"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$365.42"
+ },
+ "intent_template_id": 214
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 189,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Tell me the total cost of my latest {{status}} order?",
+ "instantiation_dict": {
+ "status": "pending"
+ },
+ "intent": "Tell me the total cost of my latest pending order?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "$754.99"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$754.99"
+ },
+ "intent_template_id": 214
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 190,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Tell me the total cost of my latest {{status}} order?",
+ "instantiation_dict": {
+ "status": "complete"
+ },
+ "intent": "Tell me the total cost of my latest complete order?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "$65.32"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$65.32"
+ },
+ "intent_template_id": 214
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 191,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Tell me the total cost of my latest {{status}} order?",
+ "instantiation_dict": {
+ "status": "processing"
+ },
+ "intent": "Tell me the total cost of my latest processing order?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "There is no order of \"processing\" status"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "There is no order of \"processing\" status"
+ },
+ "intent_template_id": 214
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 192,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Tell me the total cost of my latest {{status}} order?",
+ "instantiation_dict": {
+ "status": "non-cancelled"
+ },
+ "intent": "Tell me the total cost of my latest non-cancelled order?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "$754.99"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$754.99"
+ },
+ "intent_template_id": 214
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 193,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders",
+ "instantiation_dict": {
+ "status": "completed",
+ "N": "2"
+ },
+ "intent": "Get the total payment amount of the last 2 completed orders",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "$182.4"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$182.4"
+ },
+ "intent_template_id": 367
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 194,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders",
+ "instantiation_dict": {
+ "status": "completed",
+ "N": "5"
+ },
+ "intent": "Get the total payment amount of the last 5 completed orders",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "$555.2"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$555.2"
+ },
+ "intent_template_id": 367
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 195,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders",
+ "instantiation_dict": {
+ "status": "pending",
+ "N": "5"
+ },
+ "intent": "Get the total payment amount of the last 5 pending orders",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "$885.4"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$885.4"
+ },
+ "intent_template_id": 367
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 196,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Compare the payment difference of the last {{N}} {{status_1}} orders and {{status_2}} orders",
+ "instantiation_dict": {
+ "status_1": "cancelled",
+ "status_2": "completed",
+ "N": "4"
+ },
+ "intent": "Compare the payment difference of the last 4 cancelled orders and completed orders",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "194.25"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "194.25"
+ },
+ "intent_template_id": 367
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 197,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Get the total payment amount of the last {{N}} {{status}} orders",
+ "instantiation_dict": {
+ "status": "non-cancelled",
+ "N": "5"
+ },
+ "intent": "Get the total payment amount of the last 5 non-cancelled orders",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "$778.2"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "annotation_note": "219.4+210+166.4+93.4+89",
+ "reference_answer_raw_annotation": "$778.2"
+ },
+ "intent_template_id": 367
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 198,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Get the {{attribute}} of the {{status}} order",
+ "instantiation_dict": {
+ "attribute": "customer name",
+ "status": "most recent cancelled"
+ },
+ "intent": "Get the customer name of the most recent cancelled order",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Lily Potter"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Lily Potter"
+ },
+ "intent_template_id": 366
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 199,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Get the {{attribute}} of the {{status}} order",
+ "instantiation_dict": {
+ "attribute": "order ID",
+ "status": "newest pending"
+ },
+ "intent": "Get the order ID of the newest pending order",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "299"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "299"
+ },
+ "intent_template_id": 366
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 200,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Get the {{attribute}} of the {{status}} order",
+ "instantiation_dict": {
+ "attribute": "billing name",
+ "status": "oldest complete"
+ },
+ "intent": "Get the billing name of the oldest complete order",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "John Lee"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "John Lee"
+ },
+ "intent_template_id": 366
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 201,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Get the {{attribute}} of the {{status}} order",
+ "instantiation_dict": {
+ "attribute": "customer name",
+ "status": "earliest fraud suspect"
+ },
+ "intent": "Get the customer name of the earliest fraud suspect order",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "There is no order of \"fraud suspect\" status"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 366
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 202,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Get the {{attribute}} of the {{status}} order",
+ "instantiation_dict": {
+ "attribute": "date",
+ "status": "most recent cancelled"
+ },
+ "intent": "Get the date of the most recent cancelled order",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "May 23 2023"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "May 23, 2023"
+ },
+ "intent_template_id": 366
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 203,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Get the {{attribute}} of the {{status}} order",
+ "instantiation_dict": {
+ "attribute": "purchase date and order id",
+ "status": "most recent pending"
+ },
+ "intent": "Get the purchase date and order id of the most recent pending order",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the order id is 000000299?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the purchase date is May 31, 2023?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "000000299, May 31, 2023, 2:55:09 AM"
+ },
+ "intent_template_id": 366
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 204,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Get the {{attribute}} of the {{status}} order",
+ "instantiation_dict": {
+ "attribute": "product name and discounted price (low to high)",
+ "status": "most recent completed"
+ },
+ "intent": "Get the product name and discounted price (low to high) of the most recent completed order",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Rapha Sports Short: $35",
+ "Thorpe Track Pant: $54.4",
+ "Mach Street Sweatshirt: $62"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Rapha Sports Short: $35 Thorpe Track Pant: $54.4 Mach Street Sweatshirt: $62"
+ },
+ "intent_template_id": 366
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 205,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/a11yproject/a11yproject.com",
+ "geolocation": null,
+ "intent_template": "How many commits did {{user}} make on {{date}}?",
+ "instantiation_dict": {
+ "user": "kilian",
+ "date": "3/5/2023"
+ },
+ "intent": "How many commits did kilian make on 3/5/2023?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "1"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "1"
+ },
+ "intent_template_id": 320
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 206,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/a11yproject/a11yproject.com",
+ "geolocation": null,
+ "intent_template": "How many commits did {{user}} make on {{date}}?",
+ "instantiation_dict": {
+ "user": "Eric",
+ "date": "3/2"
+ },
+ "intent": "How many commits did Eric make on 3/2?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "2"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "2"
+ },
+ "intent_template_id": 320
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 207,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/a11yproject/a11yproject.com",
+ "geolocation": null,
+ "intent_template": "How many commits did {{user}} make on {{date}} in total?",
+ "instantiation_dict": {
+ "user": "Eric and Kilian",
+ "date": "1/3/2023"
+ },
+ "intent": "How many commits did Eric and Kilian make on 1/3/2023 in total?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "1"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "1"
+ },
+ "intent_template_id": 320
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 208,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Find the customer name and email with phone number {{PhoneNum}}",
+ "instantiation_dict": {
+ "PhoneNum": "+1 2058812302"
+ },
+ "intent": "Find the customer name and email with phone number +1 2058812302",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the customer's name is John Smith?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the customer's email is john.smith.xyz@gmail.com?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "John Smith, john.smith.xyz@gmail.com"
+ },
+ "intent_template_id": 364
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 209,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Find the customer name and email with phone number {{PhoneNum}}",
+ "instantiation_dict": {
+ "PhoneNum": "2137418080"
+ },
+ "intent": "Find the customer name and email with phone number 2137418080",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the customer's name is Jennifer White?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the customer's email is jennifer.white@yahoo.com?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Jennifer White, jennifer.white@yahoo.com"
+ },
+ "intent_template_id": 364
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 210,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Find the customer name and email with phone number {{PhoneNum}}",
+ "instantiation_dict": {
+ "PhoneNum": "2065555555"
+ },
+ "intent": "Find the customer name and email with phone number 2065555555",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the customer's name is Adam Garcia?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the customer's email is gamingpro456@gmail.com?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Adam Garcia, gamingpro456@gmail.com"
+ },
+ "intent_template_id": 364
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 211,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Find the customer name and email with phone number {{PhoneNum}}",
+ "instantiation_dict": {
+ "PhoneNum": "8015551212"
+ },
+ "intent": "Find the customer name and email with phone number 8015551212",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the customer's name is Sean Miller?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the customer's email is sean.miller@gmail.com?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Sean Miller, sean.miller@gmail.com"
+ },
+ "intent_template_id": 364
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 212,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Find the customer name and email with phone number {{PhoneNum}}",
+ "instantiation_dict": {
+ "PhoneNum": "555-229-3326"
+ },
+ "intent": "Find the customer name and email with phone number 555-229-3326",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the customer's name is Veronica Costello?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the customer's email is roni_cost@example.com?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Veronica Costello, roni_cost@example.com"
+ },
+ "intent_template_id": 364
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 213,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "What are the key aspects that the customers don't like about {{product}}",
+ "instantiation_dict": {
+ "product": "Antonia Racer Tank"
+ },
+ "intent": "What are the key aspects that the customers don't like about Antonia Racer Tank",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Not suitable for high-impact workouts"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Not suitable for high-impact workouts"
+ },
+ "intent_template_id": 249
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 214,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "What are the key aspects that the customers don't like about {{product}}",
+ "instantiation_dict": {
+ "product": "Zing Jump Rope"
+ },
+ "intent": "What are the key aspects that the customers don't like about Zing Jump Rope",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "It is hard to find the right size. Won't last long"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "It is hard to find the right size. Won't last long"
+ },
+ "intent_template_id": 249
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 215,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "What are the key aspects that the customers don't like about {{product}}",
+ "instantiation_dict": {
+ "product": "Circe ice fleece"
+ },
+ "intent": "What are the key aspects that the customers don't like about Circe ice fleece",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Material quality, fit, insufficient warmth, color"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Material quality, fit, insufficient warmth, color"
+ },
+ "intent_template_id": 249
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 216,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "What are the key aspects that the customers don't like about {{product}}",
+ "instantiation_dict": {
+ "product": "Electra Bra Top"
+ },
+ "intent": "What are the key aspects that the customers don't like about Electra Bra Top",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Not true to size"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Not true to size"
+ },
+ "intent_template_id": 249
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 217,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "What are the key aspects that the customers don't like about {{product}}",
+ "instantiation_dict": {
+ "product": "Pursuit Tone Band"
+ },
+ "intent": "What are the key aspects that the customers don't like about Pursuit Tone Band",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Insufficient resistance for their workouts."
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Insufficient resistance for their workouts."
+ },
+ "intent_template_id": 249
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 218,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Show me the walking distance from nearby hotels to {{location}} that take at most {{n}} minutes?",
+ "instantiation_dict": {
+ "location": "CMU, Pittsburgh",
+ "n": "5"
+ },
+ "intent": "Show me the walking distance from nearby hotels to CMU, Pittsburgh that take at most 5 minutes?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "There is no hotel near CMU that is within 5 minutes walking distance"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 41
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 219,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Show me the walking distance from nearby hotels to {{location}} that take at most {{n}} minutes?",
+ "instantiation_dict": {
+ "location": "Pittsburgh airport",
+ "n": "3"
+ },
+ "intent": "Show me the walking distance from nearby hotels to Pittsburgh airport that take at most 3 minutes?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "There is no hotel near Pittsburgh airport that is within 3 minutes walking distance"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 41
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 220,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Show me the walking distance from nearby hotels to {{location}} that take at most {{n}} minutes?",
+ "instantiation_dict": {
+ "location": "Gardner Steel Conference Center,",
+ "n": "5"
+ },
+ "intent": "Show me the walking distance from nearby hotels to Gardner Steel Conference Center, that take at most 5 minutes?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Wyndham Pittsburgh University Center: 375m",
+ "The Oaklander Hotel: 338m"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Wyndham Pittsburgh University Cente: 375 m\nThe Oaklander Hotel: 338 m"
+ },
+ "intent_template_id": 41
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 221,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "I am at CMU Pittsburgh, how long it takes to the nearest {{location}} with different transportation methods?",
+ "instantiation_dict": {
+ "location": "USPS postal office"
+ },
+ "intent": "I am at CMU Pittsburgh, how long it takes to the nearest USPS postal office with different transportation methods?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Walk: 1 minute",
+ "Drive: less than 1 minute",
+ "Bike: less than 1 minute"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Walk: 1 minute to walk and\nDrive: less than 1 minute\nBike: less than 1 minute"
+ },
+ "intent_template_id": 35
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 222,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "I am at CMU Pittsburgh, how long it takes to drive to the nearest {{location}}",
+ "instantiation_dict": {
+ "location": "cold stone ice cream"
+ },
+ "intent": "I am at CMU Pittsburgh, how long it takes to drive to the nearest cold stone ice cream",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "3min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "3min"
+ },
+ "intent_template_id": 35
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 223,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "I am at CMU Pittsburgh, how long it takes to drive to the nearest {{location}}",
+ "instantiation_dict": {
+ "location": "Mcdonald's"
+ },
+ "intent": "I am at CMU Pittsburgh, how long it takes to drive to the nearest Mcdonald's",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "4min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "4min"
+ },
+ "intent_template_id": 35
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 224,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "I am at CMU Pittsburgh, how long it takes to drive to the nearest {{location}}",
+ "instantiation_dict": {
+ "location": "wendys"
+ },
+ "intent": "I am at CMU Pittsburgh, how long it takes to drive to the nearest wendys",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "3min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "3min"
+ },
+ "intent_template_id": 35
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 225,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "What do customers say about {{product_type}} from {{manufature}}",
+ "instantiation_dict": {
+ "product_type": "brush",
+ "manufature": "sephora"
+ },
+ "intent": "What do customers say about brush from sephora",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "The sephora brushes don't have reviews"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 135
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 226,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "What is the price range for products from {{brand}}?",
+ "instantiation_dict": {
+ "brand": "Amazon basic"
+ },
+ "intent": "What is the price range for products from Amazon basic?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the lowest price is $5.49?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the highest price is $375.19?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$5.49 - $375.19"
+ },
+ "intent_template_id": 370
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 227,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "What is the price range for products from {{brand}}?",
+ "instantiation_dict": {
+ "brand": "EYZUTAK"
+ },
+ "intent": "What is the price range for products from EYZUTAK?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state there is only one product from EYZUTAK, and the price is $9.99?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$9.99"
+ },
+ "intent_template_id": 370
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 228,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "What is the price range for products from {{brand}}?",
+ "instantiation_dict": {
+ "brand": "sephora"
+ },
+ "intent": "What is the price range for products from sephora?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the lowest price is $18.18?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the highest price is $94.99?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$18.18 - $94.99"
+ },
+ "intent_template_id": 370
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 229,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "What is the price range for products from {{brand}}?",
+ "instantiation_dict": {
+ "brand": "ugreen"
+ },
+ "intent": "What is the price range for products from ugreen?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the lowest price is $6.99?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the highest price is $38.99?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$6.99 - $38.99"
+ },
+ "intent_template_id": 370
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 230,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "What is the price range for products from {{brand}}?",
+ "instantiation_dict": {
+ "brand": "Perricone MD"
+ },
+ "intent": "What is the price range for products from Perricone MD?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the lowest price is $35?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the highest price is $149?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$35 - $149"
+ },
+ "intent_template_id": 370
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 231,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Get the order number of my most recent {{status}} order ",
+ "instantiation_dict": {
+ "status": "cancelled"
+ },
+ "intent": "Get the order number of my most recent cancelled order ",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "170"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "000000170"
+ },
+ "intent_template_id": 213
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 232,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Get the order number of my most recent {{status}} order ",
+ "instantiation_dict": {
+ "status": "pending"
+ },
+ "intent": "Get the order number of my most recent pending order ",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "189"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "000000189"
+ },
+ "intent_template_id": 213
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 233,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Get the order number of my most recent {{status}} order ",
+ "instantiation_dict": {
+ "status": "complete"
+ },
+ "intent": "Get the order number of my most recent complete order ",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "180"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "000000180"
+ },
+ "intent_template_id": 213
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 234,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Get the order number of my most recent {{status}} order ",
+ "instantiation_dict": {
+ "status": "on hold"
+ },
+ "intent": "Get the order number of my most recent on hold order ",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "there is no on hold order"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 213
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 235,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Get the order number of my most recent {{status}} order ",
+ "instantiation_dict": {
+ "status": "under delivery"
+ },
+ "intent": "Get the order number of my most recent under delivery order ",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "There is no under delivery order"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 213
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 236,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Where is the nearest {{location}} from {{location2}} {{condition}}",
+ "instantiation_dict": {
+ "location": "pharmacy",
+ "location2": "Carnegie Mellon",
+ "condition": "I can walk within 20mins"
+ },
+ "intent": "Where is the nearest pharmacy from Carnegie Mellon I can walk within 20mins",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the pharmacy is Schiller's Pharmacy?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the address is 811 South Aiken Avenue?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Schiller's Pharmacy, 811, South Aiken Avenue, Shadyside, Pittsburgh, Allegheny County, 15232, United States"
+ },
+ "intent_template_id": 39
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 237,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Where is the nearest {{location}} from {{location2}} {{condition}}",
+ "instantiation_dict": {
+ "location": "gas station",
+ "location2": "CMU",
+ "condition": ""
+ },
+ "intent": "Where is the nearest gas station from CMU ",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the gas station is Sunoco?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the gas station is located in North Craig Street?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Sunoco, North Craig Street, North Oakland, Pittsburgh, Allegheny County, 15213, United States"
+ },
+ "intent_template_id": 39
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 238,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I am doing a market survey for one stop market, show me the product page of the most expensive product from {{product_category}} category",
+ "instantiation_dict": {
+ "product_category": "PS4 accessories"
+ },
+ "intent": "I am doing a market survey for one stop market, show me the product page of the most expensive product from PS4 accessories category",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/astro-gaming-a50-wireless-headset-base-station-gen-4-compatible-with-ps5-ps4-pc-mac-black-silver.html",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 138
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 239,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I am doing a market survey for one stop market, show me the product page of the most expensive product from {{product_category}} category",
+ "instantiation_dict": {
+ "product_category": "nutrition bars and drinks"
+ },
+ "intent": "I am doing a market survey for one stop market, show me the product page of the most expensive product from nutrition bars and drinks category",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/kellogg-s-special-k-protein-meal-bars-chocolate-caramel-12-7oz-6-count.html",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 138
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 240,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I am doing a market survey for one stop market, show me the product page of the most expensive product from {{product_category}} category",
+ "instantiation_dict": {
+ "product_category": "competitive swimwear"
+ },
+ "intent": "I am doing a market survey for one stop market, show me the product page of the most expensive product from competitive swimwear category",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/women-cross-flower-beachwear-tankini-bandeau-bandage-bikini-set-push-up-swimwear-bathing-suit-two-pieces-swimsuits.html",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 138
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 241,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I am doing a market survey for one stop market, show me the product page of the most expensive product from {{product_category}} category",
+ "instantiation_dict": {
+ "product_category": "skin care tool"
+ },
+ "intent": "I am doing a market survey for one stop market, show me the product page of the most expensive product from skin care tool category",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/professional-medi-spa-scar-stretch-mark-reduction-system.html",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 138
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 242,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I am doing a market survey for one stop market, show me the product page of the most expensive product from {{product_category}} category",
+ "instantiation_dict": {
+ "product_category": "Household Supplies"
+ },
+ "intent": "I am doing a market survey for one stop market, show me the product page of the most expensive product from Household Supplies category",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/lynx-battery-12v-200ah-lithium-iron-phosphate-lifepo4-prismatic-deep-cell-battery-set-of-4-3-2v-cells-with-3-bus-bars-and-8-lug-nuts-for-rv-solar-marine-off-grid-applications.html",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 138
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 243,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}",
+ "instantiation_dict": {
+ "information": "email address",
+ "product": "Circe fleece"
+ },
+ "intent": "Show me the email address of the customer who is the most unhappy with Circe fleece",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "hannah.lim@gmail.com"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "hannah.lim@gmail.com"
+ },
+ "intent_template_id": 244
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 244,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}",
+ "instantiation_dict": {
+ "information": "email address",
+ "product": "Olivia zip jacket"
+ },
+ "intent": "Show me the email address of the customer who is the most unhappy with Olivia zip jacket",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "emma.lopez@gmail.com"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "emma.lopez@gmail.com"
+ },
+ "intent_template_id": 244
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 245,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}",
+ "instantiation_dict": {
+ "information": "name",
+ "product": "Antonia racer tank"
+ },
+ "intent": "Show me the name of the customer who is the most unhappy with Antonia racer tank",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Shaunte"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Shaunte"
+ },
+ "intent_template_id": 244
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 246,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}",
+ "instantiation_dict": {
+ "information": "name",
+ "product": "Chloe tank"
+ },
+ "intent": "Show me the name of the customer who is the most unhappy with Chloe tank",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Teofila"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Teofila"
+ },
+ "intent_template_id": 244
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 247,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Show me the {{information}} of the customer who is the most unhappy with {{product}}",
+ "instantiation_dict": {
+ "information": "email address",
+ "product": "the style of Zoe products"
+ },
+ "intent": "Show me the email address of the customer who is the most unhappy with the style of Zoe products",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "There is no negative review for Zoe products, all reviews are positive."
+ },
+ "reference_url": "Valorie doesn't have a email in the system",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 244
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 248,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Tell me the coordinates of {{location}} in DD format",
+ "instantiation_dict": {
+ "location": "Carnegie Mellon Caf\u00e9"
+ },
+ "intent": "Tell me the coordinates of Carnegie Mellon Caf\u00e9 in DD format",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "must_include": [
+ "40.442",
+ "-79.939"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "40.4424191, -79.9397388"
+ },
+ "intent_template_id": 46
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 249,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Tell me the coordinates of {{location}} in DD format",
+ "instantiation_dict": {
+ "location": "Western Pennsylvania Hospital Heliport"
+ },
+ "intent": "Tell me the coordinates of Western Pennsylvania Hospital Heliport in DD format",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "must_include": [
+ "40.460",
+ "-79.946"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "40.46076, -79.94666"
+ },
+ "intent_template_id": 46
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 250,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Tell me the coordinates of {{location}} in DD format",
+ "instantiation_dict": {
+ "location": "Apple Store near Pitt"
+ },
+ "intent": "Tell me the coordinates of Apple Store near Pitt in DD format",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "must_include": [
+ "40.451",
+ "-79.933"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "40.4511693, -79.9334241"
+ },
+ "intent_template_id": 46
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 251,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Tell me the coordinates of {{location}} in DD format",
+ "instantiation_dict": {
+ "location": "bus stop on the Carnegie art museum side of the street near CMU"
+ },
+ "intent": "Tell me the coordinates of bus stop on the Carnegie art museum side of the street near CMU in DD format",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "must_include": [
+ "40.444",
+ "-79.948"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "40.4443, -79.94889"
+ },
+ "intent_template_id": 46
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 252,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Tell me the coordinates of {{location}} in DD format",
+ "instantiation_dict": {
+ "location": "Tokyo Japanese Food Store in Pittsburgh"
+ },
+ "intent": "Tell me the coordinates of Tokyo Japanese Food Store in Pittsburgh in DD format",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "must_include": [
+ "40.457",
+ "-79.929"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "40.45761, -79.92934"
+ },
+ "intent_template_id": 46
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 253,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the {{information}} of {{location}}",
+ "instantiation_dict": {
+ "location": "Carnegie Mellon Caf\u00e9",
+ "information": "phone number"
+ },
+ "intent": "What is the phone number of Carnegie Mellon Caf\u00e9",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "The phone number does not exist in the page"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 501
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 254,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the {{information}} of {{location}}",
+ "instantiation_dict": {
+ "location": "Western Pennsylvania Hospital",
+ "information": "phone number"
+ },
+ "intent": "What is the phone number of Western Pennsylvania Hospital",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "4125785000"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "4125785000"
+ },
+ "intent_template_id": 501
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 255,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Who is the {{information}} of {{location}}",
+ "instantiation_dict": {
+ "location": "PIT airport",
+ "information": "operator"
+ },
+ "intent": "Who is the operator of PIT airport",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Allegheny County Airport Authority"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Allegheny County Airport Authority"
+ },
+ "intent_template_id": 501
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 256,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the {{information}} of {{location}}",
+ "instantiation_dict": {
+ "location": "Carnegie art museum in pittsburgh",
+ "information": "website"
+ },
+ "intent": "What is the website of Carnegie art museum in pittsburgh",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "http://web.cmoa.org/"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "http://web.cmoa.org/"
+ },
+ "intent_template_id": 501
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 257,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What is the {{information}} of {{location}}",
+ "instantiation_dict": {
+ "location": "Tokyo Japanese Food Store in Pittsburgh",
+ "information": "hours of operation"
+ },
+ "intent": "What is the hours of operation of Tokyo Japanese Food Store in Pittsburgh",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "We-Su 10:00-17:00"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "We-Su 10:00-17:00"
+ },
+ "intent_template_id": 501
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 258,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "See all public projects",
+ "instantiation_dict": {},
+ "intent": "See all public projects",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/explore",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 325
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 259,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Get me my RSS feed token",
+ "instantiation_dict": {},
+ "intent": "Get me my RSS feed token",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "TMN_bBn9Z48qVbUFZV45"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "TMN_bBn9Z48qVbUFZV45"
+ },
+ "intent_template_id": 312
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 260,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I want to browse the products in the {{category}} category",
+ "instantiation_dict": {
+ "category": "Video Game"
+ },
+ "intent": "I want to browse the products in the Video Game category",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/video-games.html",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 211
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 261,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I want to browse the products in the {{category}} category",
+ "instantiation_dict": {
+ "category": "Headphones"
+ },
+ "intent": "I want to browse the products in the Headphones category",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/electronics/headphones.html",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 211
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 262,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I want to browse the products in the {{category}} category",
+ "instantiation_dict": {
+ "category": "Men shoes"
+ },
+ "intent": "I want to browse the products in the Men shoes category",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 211
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 263,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I want to browse the products in the {{category}} category",
+ "instantiation_dict": {
+ "category": "Woman clothing"
+ },
+ "intent": "I want to browse the products in the Woman clothing category",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/clothing-shoes-jewelry/women/clothing.html",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 211
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 264,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I want to browse the products in the {{category}} category",
+ "instantiation_dict": {
+ "category": "Cabinets, Racks & Shelves"
+ },
+ "intent": "I want to browse the products in the Cabinets, Racks & Shelves category",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/office-products/office-furniture-lighting/cabinets-racks-shelves.html",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 211
+ },
+ {
+ "sites": [
+ "wikipedia",
+ "map"
+ ],
+ "task_id": 265,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What's the closest national park to {{city}}? How far is it to drive there?",
+ "instantiation_dict": {
+ "city": "Boston"
+ },
+ "intent": "What's the closest national park to Boston? How far is it to drive there?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the distance is 457km by driving?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Acadia National Park\n457km"
+ },
+ "intent_template_id": 85
+ },
+ {
+ "sites": [
+ "wikipedia",
+ "map"
+ ],
+ "task_id": 266,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What's the closest national park to {{city}}?",
+ "instantiation_dict": {
+ "city": "the largest city in Maine"
+ },
+ "intent": "What's the closest national park to the largest city in Maine?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Acadia National Park"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Acadia National Park"
+ },
+ "intent_template_id": 85
+ },
+ {
+ "sites": [
+ "wikipedia",
+ "map"
+ ],
+ "task_id": 267,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What's the closest national park to {{city}}? How long it takes to drive there?",
+ "instantiation_dict": {
+ "city": "the hometown of Stephen King"
+ },
+ "intent": "What's the closest national park to the hometown of Stephen King? How long it takes to drive there?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the driving time is 1h 23min?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Acadia National Park\n1h 23min"
+ },
+ "intent_template_id": 85
+ },
+ {
+ "sites": [
+ "wikipedia",
+ "map"
+ ],
+ "task_id": 268,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "What's the closest national park to {{city}}? How long does it take to bike there?",
+ "instantiation_dict": {
+ "city": "Vinalhaven, ME"
+ },
+ "intent": "What's the closest national park to Vinalhaven, ME? How long does it take to bike there?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the biking time is 10h 33min?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Acadia National Park\n10h 33min"
+ },
+ "intent_template_id": 85
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 269,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category",
+ "instantiation_dict": {
+ "price": "25",
+ "product_category": "women shoes"
+ },
+ "intent": "Show me products under $25 in \"women shoes\" category",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/clothing-shoes-jewelry/women/shoes.html?price=0-25",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 139
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 270,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category",
+ "instantiation_dict": {
+ "price": "30",
+ "product_category": "men shoes"
+ },
+ "intent": "Show me products under $30 in \"men shoes\" category",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/clothing-shoes-jewelry/men/shoes.html?price=0-30",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 139
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 271,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category",
+ "instantiation_dict": {
+ "price": "46.99",
+ "product_category": "makeup remover"
+ },
+ "intent": "Show me products under $46.99 in \"makeup remover\" category",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/beauty-personal-care/makeup/makeup-remover.html?price=0-46.99",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 139
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 272,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category",
+ "instantiation_dict": {
+ "price": "78",
+ "product_category": "children dental care"
+ },
+ "intent": "Show me products under $78 in \"children dental care\" category",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/beauty-personal-care/oral-care/children-s-dental-care.html?price=0-78",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 139
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 273,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show me products under ${{price}} in \"{{product_category}}\" category",
+ "instantiation_dict": {
+ "price": "199",
+ "product_category": "furtiture with accent"
+ },
+ "intent": "Show me products under $199 in \"furtiture with accent\" category",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/home-kitchen/furniture/accent-furniture.html?price=0-199",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 139
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 274,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Search for \"{{keyword}}\"",
+ "instantiation_dict": {
+ "keyword": "usb wifi"
+ },
+ "intent": "Search for \"usb wifi\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/catalogsearch/result/?q=usb+wifi",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 212
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 275,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Search for \"{{keyword}}\"",
+ "instantiation_dict": {
+ "keyword": "xbox"
+ },
+ "intent": "Search for \"xbox\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/catalogsearch/result/?q=xbox",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 212
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 276,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Search for \"{{keyword}}\"",
+ "instantiation_dict": {
+ "keyword": "switch accessories"
+ },
+ "intent": "Search for \"switch accessories\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/catalogsearch/result/?q=switch+accessories",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 212
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 277,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Search for \"{{keyword}}\"",
+ "instantiation_dict": {
+ "keyword": "batteries for iphone 13"
+ },
+ "intent": "Search for \"batteries for iphone 13\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/catalogsearch/result/?q=iphone+13",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 212
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 278,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Search for \"{{keyword}}\"",
+ "instantiation_dict": {
+ "keyword": "green tea bag for weight loss"
+ },
+ "intent": "Search for \"green tea bag for weight loss\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/catalogsearch/result/?q=green+tea+bag+for+weight+loss",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 212
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 279,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the available models",
+ "instantiation_dict": {},
+ "intent": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the available models",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "must_include": [
+ "SONY WH1000XM3 Bluetooth Wireless Noise Canceling Headphones Silver WH-1000XM3/S (Renewed)",
+ "Sony WH-CH710N/H Wireless Bluetooth Noise Cancelling Headphones",
+ "Sony WH-1000XM3B Wireless Bluetooth Noise-Canceling Over-Ear Headphones (Black) Basic Headphone Bundle Kit with Stylus",
+ "Sony Wireless Headphones WH-CH510: Wireless Bluetooth On-Ear Headset with Mic for Phone-Call, Black",
+ "Sony WHCH710N Wireless Bluetooth Noise Canceling Over-The-Ear Headphones (Black) with Kratos 18W PD Two-Port Power Adapter and Kratos 6-Feet Nylon Braided USB-C Cable Bundle (3 Items)",
+ "Sony WI-SP500 Wireless in-Ear Sports Headphones, White (WISP500/W)",
+ "Sony WI-SP510 Extra BASS Wireless in-Ear Headset/Headphones with mic for Phone Call Sports IPX5 Bluetooth, Black (WISP510/B)",
+ "Sony MDRAS600BT Active Sports Bluetooth Headset (Black)",
+ "Sony WH-1000XM4 Wireless Noise Canceling Over-Ear Headphones (Black) with Sony WLA-NS7 Wireless TV Adapter Bundle (2 Items)",
+ "Sony WI-C300 Wireless In-Ear Headphones, Red (WIC300/R)",
+ "Sony XB950N1 Extra Bass Wireless Noise Canceling Headphones, Black",
+ "SONY - H900N Hi-Res Noise Cancelling Wireless Headphone Grayish Black Renewed"
+ ],
+ "context_qa": [
+ {
+ "question": "Does this passage mention the lowest price is $18.99?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage mention the highest price is $406?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "These models are available: SONY WH1000XM3 Bluetooth Wireless Noise Canceling Headphones Silver WH-1000XM3/S (Renewed) Sony WH-CH710N/H Wireless Bluetooth Noise Cancelling Headphones Sony WH-1000XM3B Wireless Bluetooth Noise-Canceling Over-Ear Headphones (Black) Basic Headphone Bundle Kit with Stylus Sony Wireless Headphones WH-CH510: Wireless Bluetooth On-Ear Headset with Mic for Phone-Call, Black Sony WHCH710N Wireless Bluetooth Noise Canceling Over-The-Ear Headphones (Black) with Kratos 18W PD Two-Port Power Adapter and Kratos 6-Feet Nylon Braided USB-C Cable Bundle (3 Items) Sony WI-SP500 Wireless in-Ear Sports Headphones, White (WISP500/W) Sony WI-SP510 Extra BASS Wireless in-Ear Headset/Headphones with mic for Phone Call Sports IPX5 Bluetooth, Black (WISP510/B) Sony MDRAS600BT Active Sports Bluetooth Headset (Black) Sony WH-1000XM4 Wireless Noise Canceling Over-Ear Headphones (Black) with Sony WLA-NS7 Wireless TV Adapter Bundle (2 Items) Sony WI-C300 Wireless In-Ear Headphones, Red (WIC300/R) Sony XB950N1 Extra Bass Wireless Noise Canceling Headphones, Black SONY - H900N Hi-Res Noise Cancelling Wireless Headphone Grayish Black Renewed The price ranges from $18.99 to $406 "
+ },
+ "intent_template_id": 204
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 280,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Provide me with the full names of chargers from Anker, and also share the price range for the available models",
+ "instantiation_dict": {},
+ "intent": "Provide me with the full names of chargers from Anker, and also share the price range for the available models",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage mention the lowest price is $8.99?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage mention the highest price is $59.99?",
+ "answer": "Yes"
+ }
+ ],
+ "must_include": [
+ "Anker USB C Charger 30W, 711 Charger, Compact Fast Charger (Not Foldable) for MacBook Air/iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy S21, Note 20, iPad Pro, Pixel, and More",
+ "Anker USB C Charger 40W, 521 Charger (Nano Pro), PIQ 3.0 Durable Compact Fast Charger (Not Foldable) for iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy, Pixel 4/3, iPad/iPad Mini (Cable Not Included)",
+ "Anker PowerCore Speed 20000, 20000mAh Qualcomm Quick Charge 3.0 & PowerIQ Portable Charger, with Quick Charge Recharging, Power Bank for Samsung, iPhone, iPad and More, Black (A1278)",
+ "5Ft Micro-USB Charger Cord Cable Fit for Anker-PowerCore 5000 10000 20100 13000 26800 Mini 3350 Fusion II 15000 Redux 20000 Slim 10000 Astro E1 AC Replacement Power Adapter Supply",
+ "Anker 10W Max Wireless Charger, 313 Wireless Charger (Pad), Qi-Certified Wireless Charging 7.5W for iPhone 12/12 Pro/12 mini/12 Pro Max, 10W for Galaxy S10 S9 S8, S9 Plus, Note 9 (No AC Adapter)",
+ "Anker Wireless Charger, 313 Wireless Charger (Stand), Qi-Certified for iPhone 12, 12 Pro Max, SE, 11, 11 Pro, 11 Pro Max, XR, XS Max, 10W Fast-Charging Galaxy S20, S10 (No AC Adapter)",
+ "USB Charger, Anker Elite Dual Port 24W Wall Charger, PowerPort 2 with PowerIQ and Foldable Plug, for iPhone 11/Xs/XS Max/XR/X/8/7/6/Plus, iPad Pro/Air 2/Mini 3/Mini 4, Samsung S4/S5, and More",
+ "iPhone 12 Charger [GaN Tech], Anker 30W Compact USB-C Wall Charger with Power Delivery, PowerPort Atom for iPhone 12 / Mini/Pro/Pro Max / 11 / X/XS/XR, iPad Pro, MacBook 12'', Pixel, Galaxy",
+ "USB C Charger, Anker 30W 2 Port Fast Charger with 18W USB C Power Adapter, Foldable PowerPort PD 2 Charger for iPad Pro, iPhone 11/11 Pro / 11 Pro Max/XS/Max/XR/X, Pixel, Galaxy, and More",
+ "Anker 40W 5-Port USB Wall Charger, PowerPort 5 for iPhone XS / XS Max / XR / X / 8 / 7 / 6 / Plus, iPad Pro / Air 2 / mini, Galaxy S9 / S8 / Edge / Plus, Note 8 / 7, LG, Nexus, HTC and More, Black (AK-A2124111)",
+ "Anker Quick Charge 3.0 39W Dual USB Wall Charger, PowerPort Speed 2 for Galaxy S10/S9/S8/Edge/Plus, Note 8/7 and PowerIQ for iPhone Xs/XS Max/XR/X/8/Plus, iPad Pro/Air 2/Mini, LG, Nexus, HTC and More",
+ "USB C Charger, Anker 20W PIQ 3.0 Fast Charger with Foldable Plug, PowerPort III Charger for iPhone 13/13 Mini/13 Pro/13 Pro Max/12/11, iPad/iPad Mini, MagSafe, and More (Cable Not Included)"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "These models are availiable: Anker USB C Charger 30W, 711 Charger, Compact Fast Charger (Not Foldable) for MacBook Air/iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy S21, Note 20, iPad Pro, Pixel, and More Anker USB C Charger 40W, 521 Charger (Nano Pro), PIQ 3.0 Durable Compact Fast Charger (Not Foldable) for iPhone 13/13 Mini/13 Pro/13 Pro Max/12, Galaxy, Pixel 4/3, iPad/iPad Mini (Cable Not Included) Anker PowerCore Speed 20000, 20000mAh Qualcomm Quick Charge 3.0 & PowerIQ Portable Charger, with Quick Charge Recharging, Power Bank for Samsung, iPhone, iPad and More, Black (A1278) 5Ft Micro-USB Charger Cord Cable Fit for Anker-PowerCore 5000 10000 20100 13000 26800 Mini 3350 Fusion II 15000 Redux 20000 Slim 10000 Astro E1 AC Replacement Power Adapter Supply Anker 10W Max Wireless Charger, 313 Wireless Charger (Pad), Qi-Certified Wireless Charging 7.5W for iPhone 12/12 Pro/12 mini/12 Pro Max, 10W for Galaxy S10 S9 S8, S9 Plus, Note 9 (No AC Adapter) Anker Wireless Charger, 313 Wireless Charger (Stand), Qi-Certified for iPhone 12, 12 Pro Max, SE, 11, 11 Pro, 11 Pro Max, XR, XS Max, 10W Fast-Charging Galaxy S20, S10 (No AC Adapter) USB Charger, Anker Elite Dual Port 24W Wall Charger, PowerPort 2 with PowerIQ and Foldable Plug, for iPhone 11/Xs/XS Max/XR/X/8/7/6/Plus, iPad Pro/Air 2/Mini 3/Mini 4, Samsung S4/S5, and More iPhone 12 Charger [GaN Tech], Anker 30W Compact USB-C Wall Charger with Power Delivery, PowerPort Atom for iPhone 12 / Mini/Pro/Pro Max / 11 / X/XS/XR, iPad Pro, MacBook 12'', Pixel, Galaxy USB C Charger, Anker 30W 2 Port Fast Charger with 18W USB C Power Adapter, Foldable PowerPort PD 2 Charger for iPad Pro, iPhone 11/11 Pro / 11 Pro Max/XS/Max/XR/X, Pixel, Galaxy, and More Anker 40W 5-Port USB Wall Charger, PowerPort 5 for iPhone XS / XS Max / XR / X / 8 / 7 / 6 / Plus, iPad Pro / Air 2 / mini, Galaxy S9 / S8 / Edge / Plus, Note 8 / 7, LG, Nexus, HTC and More, Black (AK-A2124111) Anker Quick Charge 3.0 39W Dual 
USB Wall Charger, PowerPort Speed 2 for Galaxy S10/S9/S8/Edge/Plus, Note 8/7 and PowerIQ for iPhone Xs/XS Max/XR/X/8/Plus, iPad Pro/Air 2/Mini, LG, Nexus, HTC and More USB C Charger, Anker 20W PIQ 3.0 Fast Charger with Foldable Plug, PowerPort III Charger for iPhone 13/13 Mini/13 Pro/13 Pro Max/12/11, iPad/iPad Mini, MagSafe, and More (Cable Not Included) Magnetic Wireless Charger, Anker Wireless Charger with 5ft Built-in USB-C Cable, PowerWave Magnetic Pad, 7.5W Charging for iPhone 13 / 13 Pro / 13 Pro Max / 13 mini / 12 / 12 Pro (No AC Adapter) USB C Super Fast Charger, Anker 25W PD Wall Charger Fast Charging for Samsung Galaxy S21/S21+/S21 Ultra/S20/Z Flip/Note20/20 Ultra/Note10/10+/S9/S8/S10e, iPad Pro 12.9, and More (Cable not Included) The price ranges from $8.99 to $59.99"
+ },
+ "intent_template_id": 204
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 281,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush",
+ "instantiation_dict": {},
+ "intent": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage mention the lowest price is $3.745?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage mention the highest price is $6.495?",
+ "answer": "Yes"
+ }
+ ],
+ "must_include": [
+ "Oral-B Kids Extra Soft Replacement Brush Heads featuring STAR WARS, 2 count",
+ "Kids By Oral-b Stages Power Star Wars Replacement Heads 4 Pack"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "These models are available: Oral-B Kids Extra Soft Replacement Brush Heads featuring STAR WARS, 2 count Kids By Oral-b Stages Power Star Wars Replacement Heads 4 Pack The price ranges from $3.745 to $6.495 "
+ },
+ "intent_template_id": 204
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 282,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "List the full product names of slide slippers from Nike and tell me the price range of the available products",
+ "instantiation_dict": {},
+ "intent": "List the full product names of slide slippers from Nike and tell me the price range of the available products",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage mention the lowest price is $27.6?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage mention the highest price is $90.65?",
+ "answer": "Yes"
+ }
+ ],
+ "must_include": [
+ "Nike Men's Air Max Camden Slide Sandal",
+ "Nike Men's Benassi JDI Fanny Pack Slides",
+ "Nike Victori One Mens Comfort Slide Cn9675-003 (Midnight Navy/Midnight Navy/White, Numeric_10)",
+ "Nike Offcourt Slide Mens Bq4639-002 Size 12",
+ "Nike Jordan Men's Break Slide Red AR6374-602",
+ "Nike Victori One Slide Mens Style : Dd9559-300",
+ "Nike Men's Benassi Solarsoft Slide Athletic Sandal (Black/White, numeric_14)",
+ "Nike Men's Benassi Solarsoft Slide Athletic Sandal (Midnight Navy/Blue, numeric_8)",
+ "Nike womens Benassi Just Do It"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "These models are available: Nike Men's Air Max Camden Slide Sandal Nike Men's Benassi JDI Fanny Pack Slides Nike Victori One Mens Comfort Slide Cn9675-003 (Midnight Navy/Midnight Navy/White, Numeric_10) Nike Offcourt Slide Mens Bq4639-002 Size 12 Nike Jordan Men's Break Slide Red AR6374-602 Nike Victori One Slide Mens Style : Dd9559-300 Nike Men's Benassi Solarsoft Slide Athletic Sandal (Black/White, numeric_14) Nike Men's Benassi Solarsoft Slide Athletic Sandal (Midnight Navy/Blue, numeric_8) Nike womens Benassi Just Do It. The price ranges from $27.6 to $90.65"
+ },
+ "intent_template_id": 204
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 283,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Look up the most recent models of XBox controllers released between 2020-2021?",
+ "instantiation_dict": {},
+ "intent": "Look up the most recent models of XBox controllers released between 2020-2021?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/microsoft-xbox-controller-carbon-black-for-series-x-series-s-xbox-one-windows-10-android-ios-bundled-with-dual-port-charging-dock-xbox-controller-skin-voucher-premgear-cloth.html",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 210
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 284,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.",
+ "instantiation_dict": {
+ "product": "shoe storage",
+ "min_storage": "12 pairs"
+ },
+ "intent": "Show the least expensive shoe storage with a minimum storage capacity of 12 pairs.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/onlyeasy-over-the-door-shoe-storage-organizer-hanging-shoe-rack-holder-with-24-large-fabric-pockets-22-1-x-61-4-herringbone-grey-mxrodsb1p.html",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 207
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 285,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.",
+ "instantiation_dict": {
+ "product": "switch card holder",
+ "min_storage": "15 cards"
+ },
+ "intent": "Show the least expensive switch card holder with a minimum storage capacity of 15 cards.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/game-card-holder-storage-case-for-nintendo-switch-games-or-ps-vita-game-case-or-sd-memory-cards-black.html",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 207
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 286,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show the least expensive {{product}} with a minimum storage capacity of {{min_storage}}.",
+ "instantiation_dict": {
+ "product": "ssd hard drive",
+ "min_storage": "1TB"
+ },
+ "intent": "Show the least expensive ssd hard drive with a minimum storage capacity of 1TB.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/external-hard-drive-2tb-ultra-thin-external-hard-drive-2000gb-ultra-high-speed-portable-3-1-type-c-storage-drive-compatible-with-pc-laptop-and-mac-2tb-a1.html",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 207
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 287,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "How much time does it take from Pittsburgh to Philadelphia by car?",
+ "instantiation_dict": {},
+ "intent": "How much time does it take from Pittsburgh to Philadelphia by car?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "5h 47min"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "5h 47min"
+ },
+ "intent_template_id": 47
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 288,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history",
+ "instantiation_dict": {
+ "attribute": "name"
+ },
+ "intent": "Tell me the name of the customer who has the most cancellations in the history",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Samantha Jones"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Samantha Jones"
+ },
+ "intent_template_id": 234
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 289,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history",
+ "instantiation_dict": {
+ "attribute": "email address, name, phone number"
+ },
+ "intent": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the email address is coolcat321@hotmail.com?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the name is Samantha Jones?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does this passage clearly state the phone number is 3055551212?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "email: coolcat321@hotmail.com name: Samantha Jones phone number: 3055551212"
+ },
+ "intent_template_id": 234
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 290,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history",
+ "instantiation_dict": {
+ "attribute": "product SKUs in the most recent cancelled orders"
+ },
+ "intent": "Tell me the product SKUs in the most recent cancelled orders of the customer who has the most cancellations in the history",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "WSH09-29-White",
+ "WSH09-28-Green",
+ "MSH11-34-Blue",
+ "WP09-29-Purple"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "WSH09-29-White,WSH09-28-Green,MSH11-34-Blue,WP09-29-Purple"
+ },
+ "intent_template_id": 234
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 291,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history",
+ "instantiation_dict": {
+ "attribute": "total spend on products in the most recent cancelled orders"
+ },
+ "intent": "Tell me the total spend on products in the most recent cancelled orders of the customer who has the most cancellations in the history",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "148"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$148"
+ },
+ "intent_template_id": 234
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 292,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Tell me the {{attribute}} of the customer who has the most cancellations in the history",
+ "instantiation_dict": {
+ "attribute": "total number of cancellations"
+ },
+ "intent": "Tell me the total number of cancellations of the customer who has the most cancellations in the history",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "9"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "9"
+ },
+ "intent_template_id": 234
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 293,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Show me the command to clone {{repo}} with SSH.",
+ "instantiation_dict": {
+ "repo": "Super_Awesome_Robot"
+ },
+ "intent": "Show me the command to clone Super_Awesome_Robot with SSH.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/super_awesome_robot.git"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/super_awesome_robot.git"
+ },
+ "intent_template_id": 329
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 294,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Show me the command to clone {{repo}} with SSH.",
+ "instantiation_dict": {
+ "repo": "ChatGPT"
+ },
+ "intent": "Show me the command to clone ChatGPT with SSH.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/chatgpt.git"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/chatgpt.git"
+ },
+ "intent_template_id": 329
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 295,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Show me the command to clone {{repo}} with SSH.",
+ "instantiation_dict": {
+ "repo": "metaseq"
+ },
+ "intent": "Show me the command to clone metaseq with SSH.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/root/metaseq.git"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/root/metaseq.git"
+ },
+ "intent_template_id": 329
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 296,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Show me the command to clone {{repo}} with SSH.",
+ "instantiation_dict": {
+ "repo": "the best GAN python implementation"
+ },
+ "intent": "Show me the command to clone the best GAN python implementation with SSH.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "ssh://git@metis.lti.cs.cmu.edu:2222/eriklindernoren/PyTorch-GAN.git"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "ssh://git@metis.lti.cs.cmu.edu:2222/eriklindernoren/PyTorch-GAN.git"
+ },
+ "intent_template_id": 329
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 297,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Show me the command to clone {{repo}} with SSH.",
+ "instantiation_dict": {
+ "repo": "the most starred Covid location tracker"
+ },
+ "intent": "Show me the command to clone the most starred Covid location tracker with SSH.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "ssh://git@metis.lti.cs.cmu.edu:2222/yjlou/2019-nCov.git"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "ssh://git@metis.lti.cs.cmu.edu:2222/yjlou/2019-nCov.git"
+ },
+ "intent_template_id": 329
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 298,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show the most recent {{status}} order",
+ "instantiation_dict": {
+ "status": "completed"
+ },
+ "intent": "Show the most recent completed order",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/sales/order/view/order_id/180/",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 180
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 299,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show the most recent {{status}} order",
+ "instantiation_dict": {
+ "status": "cancelled"
+ },
+ "intent": "Show the most recent cancelled order",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/sales/order/view/order_id/170/",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 180
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 300,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show the most recent {{status}} order",
+ "instantiation_dict": {
+ "status": "pending"
+ },
+ "intent": "Show the most recent pending order",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/sales/order/view/order_id/189/",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 180
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 301,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show the most recent {{status}} order",
+ "instantiation_dict": {
+ "status": "processing"
+ },
+ "intent": "Show the most recent processing order",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "there is no order in processing"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": ""
+ },
+ "intent_template_id": 180
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 302,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show the most recent {{status}} order",
+ "instantiation_dict": {
+ "status": "out for delivery"
+ },
+ "intent": "Show the most recent out for delivery order",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "there is no order out for delivery"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": ""
+ },
+ "intent_template_id": 180
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 303,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/a11yproject/a11yproject.com",
+ "geolocation": null,
+ "intent_template": "How many commits did {{user}} make {{period}}?",
+ "instantiation_dict": {
+ "user": "Kilian",
+ "period": "during 2023"
+ },
+ "intent": "How many commits did Kilian make during 2023?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "1"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "1"
+ },
+ "intent_template_id": 321
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 304,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/a11yproject/a11yproject.com",
+ "geolocation": null,
+ "intent_template": "How many commits did {{user}} make {{period}}?",
+ "instantiation_dict": {
+ "user": "Eric",
+ "period": "between Feb 2023 and May 2023"
+ },
+ "intent": "How many commits did Eric make between Feb 2023 and May 2023?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "14"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "14"
+ },
+ "intent_template_id": 321
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 305,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/a11yproject/a11yproject.com",
+ "geolocation": null,
+ "intent_template": "How many commits did {{user}} make {{period}}?",
+ "instantiation_dict": {
+ "user": "Philip",
+ "period": "in 2023/1"
+ },
+ "intent": "How many commits did Philip make in 2023/1?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "0"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "0"
+ },
+ "intent_template_id": 321
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 306,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/a11yproject/a11yproject.com",
+ "geolocation": null,
+ "intent_template": "How many commits did {{user}} make {{period}}?",
+ "instantiation_dict": {
+ "user": "Anthony",
+ "period": "between 08/2022-09/2022"
+ },
+ "intent": "How many commits did Anthony make between 08/2022-09/2022?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "0"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "0"
+ },
+ "intent_template_id": 321
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 307,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/a11yproject/a11yproject.com",
+ "geolocation": null,
+ "intent_template": "How many commits did {{user}} make {{period}}?",
+ "instantiation_dict": {
+ "user": "Nic",
+ "period": "in April 2021"
+ },
+ "intent": "How many commits did Nic make in April 2021?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "16"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "16"
+ },
+ "intent_template_id": 321
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 308,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Tell me who has made the most contributions, in terms of number of commits, to the {{repo}} project",
+ "instantiation_dict": {
+ "repo": "primer/design"
+ },
+ "intent": "Tell me who has made the most contributions, in terms of number of commits, to the primer/design project",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Shawn Allen"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Shawn Allen"
+ },
+ "intent_template_id": 323
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 309,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Tell me who has made the most contributions, in terms of number of commits, to the {{repo}} project",
+ "instantiation_dict": {
+ "repo": "thoughtbot/administrate"
+ },
+ "intent": "Tell me who has made the most contributions, in terms of number of commits, to the thoughtbot/administrate project",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Grayson Wright"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Grayson Wright"
+ },
+ "intent_template_id": 323
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 310,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Tell me who has made the most contributions, in terms of number of commits, to the {{repo}} project",
+ "instantiation_dict": {
+ "repo": "AndroidSlidingUpPanel"
+ },
+ "intent": "Tell me who has made the most contributions, in terms of number of commits, to the AndroidSlidingUpPanel project",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "tokudu"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "tokudu"
+ },
+ "intent_template_id": 323
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 311,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Tell me who has made the most contributions, in terms of number of commits, to the {{repo}} project",
+ "instantiation_dict": {
+ "repo": "Pytorch GAN"
+ },
+ "intent": "Tell me who has made the most contributions, in terms of number of commits, to the Pytorch GAN project",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Erik Linder-Nor\u00e9n"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Erik Linder-Nor\u00e9n"
+ },
+ "intent_template_id": 323
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 312,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Tell me who has made the most contributions, in terms of number of commits, to the {{repo}} project",
+ "instantiation_dict": {
+ "repo": "csvkit"
+ },
+ "intent": "Tell me who has made the most contributions, in terms of number of commits, to the csvkit project",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Christopher Groskopf"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Christopher Groskopf"
+ },
+ "intent_template_id": 323
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 313,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Which number to call for the customer service?",
+ "instantiation_dict": {},
+ "intent": "Which number to call for the customer service?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "The website does not list the customer service phone number"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 134
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 314,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo.",
+ "instantiation_dict": {
+ "repo": "primer/design",
+ "attribute": "name"
+ },
+ "intent": "List the name of the top 3 contributors to primer/design repo.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does the passage include the contributor whose name is Shawn Allen?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does the passage include the contributor whose name is Inayaili Le\u00f3n?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does the passage include the contributor whose name is Aurora Pleguezuelo?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Shawn Allen, Inayaili Le\u00f3n, Aurora Pleguezuelo"
+ },
+ "intent_template_id": 324
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 315,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo.",
+ "instantiation_dict": {
+ "repo": "Pytorch GAN",
+ "attribute": "email address"
+ },
+ "intent": "List the email address of the top 3 contributors to Pytorch GAN repo.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "must_include": [
+ "eriklindernoren@live.se",
+ "eriklindernoren@gmail.com",
+ "pinnacle.chen@qq.com"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "eriklindernoren@live.se, eriklindernoren@gmail.com, pinnacle.chen@qq.com"
+ },
+ "intent_template_id": 324
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 316,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo.",
+ "instantiation_dict": {
+ "repo": "facebook's guide on building react apps",
+ "attribute": "name"
+ },
+ "intent": "List the name of the top 3 contributors to facebook's guide on building react apps repo.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does the passage include the contributor whose name is Ian Sutherland?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does the passage include the contributor whose name is Joe Hadda?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does the passage include the contributor whose name is Dan Abramov?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Ian Sutherland, Joe Hadda, Dan Abramov"
+ },
+ "intent_template_id": 324
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 317,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo.",
+ "instantiation_dict": {
+ "repo": "metaseq",
+ "attribute": "name and number of commits"
+ },
+ "intent": "List the name and number of commits of the top 3 contributors to metaseq repo.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does the passage include the contributor whose name is Susan Zhang?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does the passage clearly state that Susan Zhang has 70 commits?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does the passage include the contributor whose name is Stephen Roller?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does the passage clearly state that Stephen Roller has 51 commits?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does the passage include the contributor whose name is Peter Albert?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does the passage clearly state that Peter Albert has 12 commits?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Susan Zhang: 70, Stephen Roller: 51, Peter Albert: 12"
+ },
+ "intent_template_id": 324
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 318,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "List the {{attribute}} of the top 3 contributors to {{repo}} repo.",
+ "instantiation_dict": {
+ "repo": "2019-nCov",
+ "attribute": "last names"
+ },
+ "intent": "List the last names of the top 3 contributors to 2019-nCov repo.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does the passage include the contributor whose last name is Lo?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does the passage include the contributor whose last name is Chen?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does the passage include the contributor whose last name is Chu?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Lo, Chen, Chu"
+ },
+ "intent_template_id": 324
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 319,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "How much refund I should expect from my order cancelled in {{time}}, including shipping fee",
+ "instantiation_dict": {
+ "time": "April 2022"
+ },
+ "intent": "How much refund I should expect from my order cancelled in April 2022, including shipping fee",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "0"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "0"
+ },
+ "intent_template_id": 160
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 320,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "How much refund I should expect from my order cancelled in {{time}}, including shipping fee",
+ "instantiation_dict": {
+ "time": "Feb 2023"
+ },
+ "intent": "How much refund I should expect from my order cancelled in Feb 2023, including shipping fee",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "$406.53"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$406.53"
+ },
+ "intent_template_id": 160
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 321,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "How much refund I should expect from my order cancelled in {{time}}, including shipping fee",
+ "instantiation_dict": {
+ "time": "2022"
+ },
+ "intent": "How much refund I should expect from my order cancelled in 2022, including shipping fee",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "$3053.97"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$3053.97"
+ },
+ "intent_template_id": 160
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 322,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "How much refund I should expect from my order cancelled in {{time}} if I cannot get the shipping fee refunded?",
+ "instantiation_dict": {
+ "time": "May 2023"
+ },
+ "intent": "How much refund I should expect from my order cancelled in May 2023 if I cannot get the shipping fee refunded?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "$350.42"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$350.42"
+ },
+ "intent_template_id": 160
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 323,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "How much refund I should expect from my order cancelled in {{time}}? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back",
+ "instantiation_dict": {
+ "time": "2022/03"
+ },
+ "intent": "How much refund I should expect from my order cancelled in 2022/03? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "$264.49"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$264.49"
+ },
+ "intent_template_id": 160
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 324,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.",
+ "instantiation_dict": {
+ "product": "chairs",
+ "sorting_order": "ascending price"
+ },
+ "intent": "Show me the \"chairs\" listings by ascending price.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/catalogsearch/result/index/?product_list_order=price&q=chairs&product_list_dir=asc",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 208
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 325,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.",
+ "instantiation_dict": {
+ "product": "mouth night guard",
+ "sorting_order": "descending price"
+ },
+ "intent": "Show me the \"mouth night guard\" listings by descending price.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/catalogsearch/result/index/?q=mouth%20night%20guard%20&product_list_order=price",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 208
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 326,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.",
+ "instantiation_dict": {
+ "product": "Canon photo printer",
+ "sorting_order": "search relevance, from most to least"
+ },
+ "intent": "Show me the \"Canon photo printer\" listings by search relevance, from most to least.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/catalogsearch/result/?q=Canon+photo+printer",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 208
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 327,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.",
+ "instantiation_dict": {
+ "product": "iphone 12 phone case",
+ "sorting_order": "name alphabetically"
+ },
+ "intent": "Show me the \"iphone 12 phone case\" listings by name alphabetically.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/catalogsearch/result/index/?q=%20iphone%2012%20phone%20case&product_list_order=name",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 208
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 328,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show me the \"{{product}}\" listings by {{sorting_order}}.",
+ "instantiation_dict": {
+ "product": "iphone 12 phone case",
+ "sorting_order": "price"
+ },
+ "intent": "Show me the \"iphone 12 phone case\" listings by price.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/catalogsearch/result/index/?product_list_order=price&q=%20iphone%2012%20phone%20case",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 208
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 329,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "How much I spend {{time}} on shopping at One Stop Market?",
+ "instantiation_dict": {
+ "time": "on 4/19/2023"
+ },
+ "intent": "How much I spend on 4/19/2023 on shopping at One Stop Market?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "0"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "0"
+ },
+ "intent_template_id": 147
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 330,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "How much I spend {{time}} on shopping at One Stop Market?",
+ "instantiation_dict": {
+ "time": "in March 2023"
+ },
+ "intent": "How much I spend in March 2023 on shopping at One Stop Market?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "$81.31"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$81.31"
+ },
+ "intent_template_id": 147
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 331,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "How much I spend {{time}} on shopping at One Stop Market?",
+ "instantiation_dict": {
+ "time": "in July 2022"
+ },
+ "intent": "How much I spend in July 2022 on shopping at One Stop Market?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "$40.16"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$40.16"
+ },
+ "intent_template_id": 147
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 332,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "How much I spend {{time}} on shopping at One Stop Market?",
+ "instantiation_dict": {
+ "time": "each month from Jan to the end of March 2023"
+ },
+ "intent": "How much I spend each month from Jan to the end of March 2023 on shopping at One Stop Market?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Jan: 572.8",
+ "Feb: 762.18",
+ "Mar: 83.31"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Jan: 572.8\nFeb: 762.18\nMar: 83.31"
+ },
+ "intent_template_id": 147
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 333,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "How much did I spend on shopping at One Stop Market {{time}}? They gave me a 20% discount on the total amount for orders exceeding $200 in cash",
+ "instantiation_dict": {
+ "time": "on November 2022"
+ },
+ "intent": "How much did I spend on shopping at One Stop Market on November 2022? They gave me a 20% discount on the total amount for orders exceeding $200 in cash",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "$359.546"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "$359.546"
+ },
+ "intent_template_id": 147
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 334,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Tell me when I last ordered my {{description}}?",
+ "instantiation_dict": {
+ "description": "muffin cornbread mix"
+ },
+ "intent": "Tell me when I last ordered my muffin cornbread mix?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "March 11th 2023"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "March 11th 2023"
+ },
+ "intent_template_id": 169
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 335,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Tell me when I last ordered my {{description}}?",
+ "instantiation_dict": {
+ "description": "body butter"
+ },
+ "intent": "Tell me when I last ordered my body butter?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "January 16th 2023"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "January 16th 2023"
+ },
+ "intent_template_id": 169
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 336,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Tell me when I last ordered my {{description}}?",
+ "instantiation_dict": {
+ "description": "conditioner"
+ },
+ "intent": "Tell me when I last ordered my conditioner?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "January 16th 2023"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "January 16th 2023"
+ },
+ "intent_template_id": 169
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 337,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Tell me when I last ordered my {{description}}?",
+ "instantiation_dict": {
+ "description": "bread olive"
+ },
+ "intent": "Tell me when I last ordered my bread olive?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "December 12th 2022"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "December 12th 2022"
+ },
+ "intent_template_id": 169
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 338,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Tell me when I last ordered my {{description}}?",
+ "instantiation_dict": {
+ "description": "toothpaste"
+ },
+ "intent": "Tell me when I last ordered my toothpaste?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "December 4th 2022"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "December 4th 2022"
+ },
+ "intent_template_id": 169
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 339,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/a11yproject/a11yproject.com",
+ "geolocation": null,
+ "intent_template": "List all opened issues {{description}}",
+ "instantiation_dict": {
+ "description": "that report bugs"
+ },
+ "intent": "List all opened issues that report bugs",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues/?label_name%5B%5D=bug",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 299
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 340,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/primer/design",
+ "geolocation": null,
+ "intent_template": "List all opened issues {{description}}",
+ "instantiation_dict": {
+ "description": "that report bugs"
+ },
+ "intent": "List all opened issues that report bugs",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/primer/design/-/issues/?label_name%5B%5D=type%3A%20bug%20%F0%9F%90%9E",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 299
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 341,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/root/metaseq",
+ "geolocation": null,
+ "intent_template": "List all opened issues {{description}}",
+ "instantiation_dict": {
+ "description": "requesting new features"
+ },
+ "intent": "List all opened issues requesting new features",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/root/metaseq/-/issues/?label_name%5B%5D=enhancement",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 299
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 342,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/root/metaseq",
+ "geolocation": null,
+ "intent_template": "List all opened issues {{description}}",
+ "instantiation_dict": {
+ "description": "that ask about OPT model related questions"
+ },
+ "intent": "List all opened issues that ask about OPT model related questions",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/root/metaseq/-/issues/?search=OPT&label_name%5B%5D=question",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 299
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 343,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/root/metaseq",
+ "geolocation": null,
+ "intent_template": "List all opened issues {{description}}",
+ "instantiation_dict": {
+ "description": "that don't have any labels"
+ },
+ "intent": "List all opened issues that don't have any labels",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/root/metaseq/-/issues/?label_name%5B%5D=None",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 299
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 344,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "How many reviews our shop received {{time}}?",
+ "instantiation_dict": {
+ "time": "so far"
+ },
+ "intent": "How many reviews our shop received so far?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "351"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "351"
+ },
+ "intent_template_id": 248
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 345,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "How many reviews our shop received {{time}}?",
+ "instantiation_dict": {
+ "time": "in Apr 2023"
+ },
+ "intent": "How many reviews our shop received in Apr 2023?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "351"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "351"
+ },
+ "intent_template_id": 248
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 346,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "How many reviews our shop received {{time}}?",
+ "instantiation_dict": {
+ "time": "during 2022"
+ },
+ "intent": "How many reviews our shop received during 2022?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "0"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "0"
+ },
+ "intent_template_id": 248
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 347,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "How many reviews our shop received {{time}}?",
+ "instantiation_dict": {
+ "time": "from the beginning of the shop"
+ },
+ "intent": "How many reviews our shop received from the beginning of the shop?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "351"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "351"
+ },
+ "intent_template_id": 248
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 348,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "How many reviews our shop received {{time}}?",
+ "instantiation_dict": {
+ "time": "in May 2023"
+ },
+ "intent": "How many reviews our shop received in May 2023?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "0"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "0"
+ },
+ "intent_template_id": 248
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 349,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Who else have access to my repo {{repo}}, show me their usernames",
+ "instantiation_dict": {
+ "repo": "gimmiethat.space"
+ },
+ "intent": "Who else have access to my repo gimmiethat.space, show me their usernames",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "yjlou"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "yjlou"
+ },
+ "intent_template_id": 298
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 350,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Who else have access to my repo {{repo}}, show me their usernames",
+ "instantiation_dict": {
+ "repo": "prism-theme"
+ },
+ "intent": "Who else have access to my repo prism-theme, show me their usernames",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "abisubramanya27"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Abishek S, abisubramanya27"
+ },
+ "intent_template_id": 298
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 351,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "List products from {{product_category}} category by {{order}} price",
+ "instantiation_dict": {
+ "product_category": "PS4 accessories",
+ "order": "ascending"
+ },
+ "intent": "List products from PS4 accessories category by ascending price",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/video-games/playstation-4/accessories.html?product_list_order=price",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 137
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 352,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "List products from {{product_category}} category by {{order}} price",
+ "instantiation_dict": {
+ "product_category": "nutrition bars and drinks",
+ "order": "ascending"
+ },
+ "intent": "List products from nutrition bars and drinks category by ascending price",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/health-household/diet-sports-nutrition/nutrition-bars-drinks.html?product_list_order=price",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 137
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 353,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "List products from {{product_category}} category by {{order}} price",
+ "instantiation_dict": {
+ "product_category": "competitive swimwear",
+ "order": "ascending"
+ },
+ "intent": "List products from competitive swimwear category by ascending price",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/clothing-shoes-jewelry/sport-specific-clothing/competitive-swimwear.html?product_list_order=price",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 137
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 354,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "List products from {{product_category}} category by {{order}} price",
+ "instantiation_dict": {
+ "product_category": "living room furniture",
+ "order": "descending"
+ },
+ "intent": "List products from living room furniture category by descending price",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/home-kitchen/furniture/living-room-furniture.html?product_list_order=price&product_list_dir=desc",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 137
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 355,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "List products from {{product_category}} category by {{order}} price",
+ "instantiation_dict": {
+ "product_category": "kids' bedding",
+ "order": "descending"
+ },
+ "intent": "List products from kids' bedding category by descending price",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/home-kitchen/bedding/kids-bedding.html?product_list_dir=desc",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 137
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 356,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Show the route from SCS CMU in Pittsburgh to the location where the Declaration of Independence and Constitution were signed",
+ "instantiation_dict": {},
+ "intent": "Show the route from SCS CMU in Pittsburgh to the location where the Declaration of Independence and Constitution were signed",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": null,
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_from\"]').value",
+ "required_contents": {
+ "must_include": [
+ "Gates and Hillman Centers",
+ "Pittsburgh"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_to\"]').value",
+ "required_contents": {
+ "must_include": [
+ "Independence Hall",
+ "Philadelphia"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 49
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 357,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Checkout merge requests requiring my review",
+ "instantiation_dict": {},
+ "intent": "Checkout merge requests requiring my review",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/dashboard/merge_requests?reviewer_username=byteblaze",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 291
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 358,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show me the {{info}} for order number {{order_number}}.",
+ "instantiation_dict": {
+ "info": "shipping method",
+ "order_number": "187"
+ },
+ "intent": "Show me the shipping method for order number 187.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "Flat Rate - Fixed"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Flat Rate - Fixed"
+ },
+ "intent_template_id": 206
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 359,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show me the {{info}} for order number {{order_number}}.",
+ "instantiation_dict": {
+ "info": "order date",
+ "order_number": "148"
+ },
+ "intent": "Show me the order date for order number 148.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "1/29/2023"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "1/29/2023"
+ },
+ "intent_template_id": 206
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 360,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show me the {{info}} for order number {{order_number}}.",
+ "instantiation_dict": {
+ "info": "product names",
+ "order_number": "148"
+ },
+ "intent": "Show me the product names for order number 148.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress)",
+ "Russound 5B45W 4\" Indoor Outdoor Speakers White"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress), Russound 5B45W 4\" Indoor Outdoor Speakers White"
+ },
+ "intent_template_id": 206
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 361,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show me the {{info}} for order number {{order_number}}.",
+ "instantiation_dict": {
+ "info": "order statuses",
+ "order_number": "170 and 189"
+ },
+ "intent": "Show me the order statuses for order number 170 and 189.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "170: cancelled",
+ "189: pending"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "170: cancelled, 189: pending"
+ },
+ "intent_template_id": 206
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 362,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Show me the {{info}} for order number {{order_number}}.",
+ "instantiation_dict": {
+ "info": "billing address",
+ "order_number": "00178"
+ },
+ "intent": "Show me the billing address for order number 00178.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "context_qa": [
+ {
+ "question": "Does this passage clearly state the billing address of order number 00178 is 101 S San Mateo Dr, San Mateo, California, 94010, United States?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Emma Lopez, 101 S San Mateo Dr, San Mateo, California, 94010, United States"
+ },
+ "intent_template_id": 206
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 363,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking",
+ "instantiation_dict": {
+ "location/address_1": "Carnegie Mellon University",
+ "location/address_2": "Carnegie Music Hall"
+ },
+ "intent": "Measure distance between Carnegie Mellon University and Carnegie Music Hall by walking",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "748m"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "748m"
+ },
+ "intent_template_id": 58
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 364,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking",
+ "instantiation_dict": {
+ "location/address_1": "Carnegie Mellon University",
+ "location/address_2": "UPMC Shadyside"
+ },
+ "intent": "Measure distance between Carnegie Mellon University and UPMC Shadyside by walking",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "1.7km"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "1.7km"
+ },
+ "intent_template_id": 58
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 365,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking",
+ "instantiation_dict": {
+ "location/address_1": "Carnegie Music Hall",
+ "location/address_2": "UPMC Shadyside"
+ },
+ "intent": "Measure distance between Carnegie Music Hall and UPMC Shadyside by walking",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "2.2km"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "2.2km"
+ },
+ "intent_template_id": 58
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 366,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking",
+ "instantiation_dict": {
+ "location/address_1": "CVS (closest one)",
+ "location/address_2": "UPMC Shadyside"
+ },
+ "intent": "Measure distance between CVS (closest one) and UPMC Shadyside by walking",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "1.2km"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "1.2km"
+ },
+ "intent_template_id": 58
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 367,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Measure distance between {{location/address_1}} and {{location/address_2}} by walking",
+ "instantiation_dict": {
+ "location/address_1": "Carnegie Mellon University",
+ "location/address_2": "CVS (closest one)"
+ },
+ "intent": "Measure distance between Carnegie Mellon University and CVS (closest one) by walking",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "1.4km"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "1.4km"
+ },
+ "intent_template_id": 58
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 368,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "find discounted items.",
+ "instantiation_dict": {},
+ "intent": "find discounted items.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "There is no function to show only discount items"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "There is no function to show only discount items."
+ },
+ "intent_template_id": 188
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 369,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Pull up the description page of {{location}} on Map",
+ "instantiation_dict": {
+ "location": "Carnegie Music Hall"
+ },
+ "intent": "Pull up the description page of Carnegie Music Hall on Map",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": null,
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sidebar_content\"]').outerText",
+ "required_contents": {
+ "must_include": [
+ "Carnegie Music Hall"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 52
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 370,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Pull up the description page of {{location}} on Map",
+ "instantiation_dict": {
+ "location": "Carnegie Mellon University"
+ },
+ "intent": "Pull up the description page of Carnegie Mellon University on Map",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": null,
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sidebar_content\"]').outerText",
+ "required_contents": {
+ "must_include": [
+ "Carnegie Mellon University"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 52
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 371,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Pull up the description page of {{location}} on Map",
+ "instantiation_dict": {
+ "location": "Piada restaurant near Pitt"
+ },
+ "intent": "Pull up the description page of Piada restaurant near Pitt on Map",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": null,
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sidebar_content\"]').outerText",
+ "required_contents": {
+ "must_include": [
+ "Piada Italian Street Food",
+ "Forbes Avenue"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 52
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 372,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Pull up the description page of {{location}} on Map",
+ "instantiation_dict": {
+ "location": "the Costco in Pittsburgh near a river"
+ },
+ "intent": "Pull up the description page of the Costco in Pittsburgh near a river on Map",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": null,
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sidebar_content\"]').outerText",
+ "required_contents": {
+ "must_include": [
+ "Costco",
+ "Waterfront Drive West"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 52
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 373,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Pull up the description page of {{location}} on Map",
+ "instantiation_dict": {
+ "location": "Whole Foods near Carnegie Mellon"
+ },
+ "intent": "Pull up the description page of Whole Foods near Carnegie Mellon on Map",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": null,
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sidebar_content\"]').outerText",
+ "required_contents": {
+ "must_include": [
+ "Whole Foods",
+ "East Liberty"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 52
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 374,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Preview the {{name}} theme for my shop",
+ "instantiation_dict": {
+ "name": "Magento Blank"
+ },
+ "intent": "Preview the Magento Blank theme for my shop",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/1",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 266
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 375,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Preview the {{name}} theme for my shop",
+ "instantiation_dict": {
+ "name": "Magento Luma"
+ },
+ "intent": "Preview the Magento Luma theme for my shop",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/admin/system_design_theme/edit/id/3/key/",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 266
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 376,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Summarize customer reviews for {{product}}.",
+ "instantiation_dict": {
+ "product": "Amazon Echo Dot 3rd generation"
+ },
+ "intent": "Summarize customer reviews for Amazon Echo Dot 3rd generation.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "Amazon Echo Dot 3rd generation does not have any reviews."
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 182
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 377,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Find the {{space}} around {{location}}",
+ "instantiation_dict": {
+ "location": "CMU ArtPark Lab",
+ "space": "restaurants"
+ },
+ "intent": "Find the restaurants around CMU ArtPark Lab",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__MAP__/search?query=restaurants%20near%20CMU%20ArtPark%20Lab",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 59
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 378,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Find the {{space}} around {{location}}",
+ "instantiation_dict": {
+ "location": "CMU main campus",
+ "space": "parking"
+ },
+ "intent": "Find the parking around CMU main campus",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__MAP__/search?query=parking%20near%20carnegie%20mellon%20university",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 59
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 379,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Find the {{space}} around {{location}}",
+ "instantiation_dict": {
+ "location": "CMU main campus",
+ "space": "hotel"
+ },
+ "intent": "Find the hotel around CMU main campus",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__MAP__/search?query=hotels%20near%20carnegie%20mellon%20university",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 59
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 380,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Find the {{space}} around {{location}}",
+ "instantiation_dict": {
+ "location": "Carnegie Music Hall",
+ "space": "bar"
+ },
+ "intent": "Find the bar around Carnegie Music Hall",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__MAP__/search?query=bars%20near%20Carnegie%20Music%20Hall",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 59
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 381,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Find the {{space}} around {{location}}",
+ "instantiation_dict": {
+ "location": "Carnegie Music Hall",
+ "space": "hotel"
+ },
+ "intent": "Find the hotel around Carnegie Music Hall",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match"
+ ],
+ "reference_answers": null,
+ "reference_url": "__MAP__/search?query=hotels%20near%20Carnegie%20Music%20Hall",
+ "program_html": [],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 59
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 382,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services",
+ "instantiation_dict": {},
+ "intent": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "There is no USCIS nearby"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 781
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 383,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "I am arriving at Pittsburgh Airport. Show me the name of a Hyatt hotel if there is any nearby. Tell me the names of supermarkets that are within 15mins driving from the hotel",
+ "instantiation_dict": {},
+ "intent": "I am arriving at Pittsburgh Airport. Show me the name of a Hyatt hotel if there is any nearby. Tell me the names of supermarkets that are within 15mins driving from the hotel",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Hyatt Regency Pittsburgh International Airport",
+ "Giant Eagle",
+ "ALDI"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Hyatt Regency Pittsburgh International Airport Giant Eagle, ALDI"
+ },
+ "intent_template_id": 782
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 384,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "List the customer names who complain about the quality of EYZUTAK phone cases",
+ "instantiation_dict": {},
+ "intent": "List the customer names who complain about the quality of EYZUTAK phone cases",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Lisa Lee",
+ "Evelyn Kurver",
+ "Amanda",
+ "N Randall"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Lisa Lee, Evelyn Kurver, Amanda, N Randall"
+ },
+ "intent_template_id": 666
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 385,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "List the customer names who think EYZUTAK phone cases are good looking",
+ "instantiation_dict": {},
+ "intent": "List the customer names who think EYZUTAK phone cases are good looking",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Lisa Lee",
+ "MH",
+ "Misba009",
+ "Amanda",
+ "N Randall",
+ "Amazon Customer",
+ "Cally",
+ "Bethany Robertson"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Lisa Lee, MH, Misba009, Amanda, N Randall, Amazon Customer, Cally, Bethany Robertson"
+ },
+ "intent_template_id": 666
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 386,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "What is the rating of {{product}}",
+ "instantiation_dict": {
+ "product": "Ugreen lightning to 3.5mm cable"
+ },
+ "intent": "What is the rating of Ugreen lightning to 3.5mm cable",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "On a scale of 100, it's around 65%. On a 5-star scale, it's about 3.1 stars."
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Around 3.1 stars"
+ },
+ "intent_template_id": 1355
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 387,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Who gave {{stars}} for phone cases from EYZUTAK",
+ "instantiation_dict": {
+ "stars": "4 or 5 stars"
+ },
+ "intent": "Who gave 4 or 5 stars for phone cases from EYZUTAK",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "MH",
+ "Misba009",
+ "Amanda",
+ "Amazon Customer",
+ "Cally",
+ "Bethany Robertson"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "MH, Misba009, Amanda, Amazon Customer, Cally, Bethany Robertson, Bethany Robertson"
+ },
+ "intent_template_id": 1356
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 388,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Who gave {{stars}} for phone cases from EYZUTAK",
+ "instantiation_dict": {
+ "stars": "1 or 2 stars"
+ },
+ "intent": "Who gave 1 or 2 stars for phone cases from EYZUTAK",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_must_include": [
+ "Evelyn Kurver",
+ "N Randall"
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Evelyn Kurver, N Randall"
+ },
+ "intent_template_id": 1356
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 389,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project",
+ "instantiation_dict": {
+ "mr": "octovisuals page",
+ "repo": "primer/design",
+ "content": "Thanks, working on reviews"
+ },
+ "intent": "Post \"Thanks, working on reviews\" for the merge request related to octovisuals page in primer/design project",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/primer/design/-/merge_requests/450",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Thanks, working on reviews"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 348
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 390,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project",
+ "instantiation_dict": {
+ "mr": "semantic HTML post",
+ "repo": "a11yproject/a11yproject.com",
+ "content": "lgtm"
+ },
+ "intent": "Post \"lgtm\" for the merge request related to semantic HTML post in a11yproject/a11yproject.com project",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1531",
+ "locator": "document.querySelector('[id=\"notes-list\"]').lastElementChild.querySelector('.timeline-discussion-body').outerText",
+ "required_contents": {
+ "exact_match": "lgtm"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 348
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 391,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project",
+ "instantiation_dict": {
+ "mr": "focus edge cases",
+ "repo": "a11yproject/a11yproject.com",
+ "content": "close because non reproducible"
+ },
+ "intent": "Post \"close because non reproducible\" for the merge request related to focus edge cases in a11yproject/a11yproject.com project",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1265",
+ "locator": "document.querySelector('[id=\"notes-list\"]').lastElementChild.querySelector('.timeline-discussion-body').outerText",
+ "required_contents": {
+ "exact_match": "close because non reproducible"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 348
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 392,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project",
+ "instantiation_dict": {
+ "mr": "color utility",
+ "repo": "a11yproject.com",
+ "content": "Good idea"
+ },
+ "intent": "Post \"Good idea\" for the merge request related to color utility in a11yproject.com project",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1071",
+ "locator": "document.querySelector('[id=\"notes-list\"]').lastElementChild.querySelector('.timeline-discussion-body').outerText",
+ "required_contents": {
+ "exact_match": "Good idea"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 348
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 393,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Post \"{{content}}\" for the merge request related to {{mr}} in {{repo}} project",
+ "instantiation_dict": {
+ "mr": "fixing the broken links",
+ "repo": "byteblaze/empathy-prompts",
+ "content": "lgtm"
+ },
+ "intent": "Post \"lgtm\" for the merge request related to fixing the broken links in byteblaze/empathy-prompts project",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/empathy-prompts/-/merge_requests/19",
+ "locator": "document.querySelector('[id=\"notes-list\"]').lastElementChild.querySelector('.timeline-discussion-body').outerText",
+ "required_contents": {
+ "exact_match": "lgtm"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 348
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 394,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Fork {{repo}}.",
+ "instantiation_dict": {
+ "repo": "2019-nCov"
+ },
+ "intent": "Fork 2019-nCov.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/2019-nCov",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "2019-nCov"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 352
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 395,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Fork {{repo}}.",
+ "instantiation_dict": {
+ "repo": "the Pytorch GAN repo with most stars"
+ },
+ "intent": "Fork the Pytorch GAN repo with most stars.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/PyTorch-GAN",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Pytorch-GAN"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 352
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 396,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Fork {{repo}}.",
+ "instantiation_dict": {
+ "repo": "ChatGPT"
+ },
+ "intent": "Fork ChatGPT.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/ChatGPT",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "ChatGPT"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 352
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 397,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Fork {{repo}}.",
+ "instantiation_dict": {
+ "repo": "MetaSeq"
+ },
+ "intent": "Fork MetaSeq.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/metaseq",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "metaseq"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 352
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 398,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Fork {{repo}}.",
+ "instantiation_dict": {
+ "repo": "all source repos from Akilesh Kannan"
+ },
+ "intent": "Fork all source repos from Akilesh Kannan.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/SimCache",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "SimCache"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/dots",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "dots"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/CacheEval",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "CacheEval"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/nvidia-patch",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "404"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/viewgrades-scraper",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "404"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 352
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 399,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Change my reddit bio to \"{{content}}\"",
+ "instantiation_dict": {
+ "content": "I am a robot"
+ },
+ "intent": "Change my reddit bio to \"I am a robot\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/user/MarvelsGrantMan136",
+ "locator": "document.querySelector(\".user-bio__biography\").outerText",
+ "required_contents": {
+ "exact_match": "I am a robot"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 6
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 400,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Change my reddit bio to \"{{content}}\"",
+ "instantiation_dict": {
+ "content": "Pro Python Developer with 20 years of Experience"
+ },
+ "intent": "Change my reddit bio to \"Pro Python Developer with 20 years of Experience\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/user/MarvelsGrantMan136",
+ "locator": "document.querySelector(\".user-bio__biography\").outerText",
+ "required_contents": {
+ "exact_match": "Pro Python Developer with 20 years of Experience"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 6
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 401,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Change my reddit bio to \"{{content}}\"",
+ "instantiation_dict": {
+ "content": "Seeking SDE positions"
+ },
+ "intent": "Change my reddit bio to \"Seeking SDE positions\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/user/MarvelsGrantMan136",
+ "locator": "document.querySelector(\".user-bio__biography\").outerText",
+ "required_contents": {
+ "exact_match": "Seeking SDE positions"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 6
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 402,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Change my reddit bio to \"{{content}}\"",
+ "instantiation_dict": {
+ "content": "Freelance Web Developer"
+ },
+ "intent": "Change my reddit bio to \"Freelance Web Developer\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/user/MarvelsGrantMan136",
+ "locator": "document.querySelector(\".user-bio__biography\").outerText",
+ "required_contents": {
+ "exact_match": "Freelance Web Developer"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 6
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 403,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Change my reddit bio to \"{{content}}\"",
+ "instantiation_dict": {
+ "content": "Awesome Prompt Artist"
+ },
+ "intent": "Change my reddit bio to \"Awesome Prompt Artist\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/user/MarvelsGrantMan136",
+ "locator": "document.querySelector(\".user-bio__biography\").outerText",
+ "required_contents": {
+ "exact_match": "Awesome Prompt Artist"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 6
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 404,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Upvote the newest post in {{subreddit}} subreddit",
+ "instantiation_dict": {
+ "subreddit": "books"
+ },
+ "intent": "Upvote the newest post in books subreddit",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/books/124260/adults-reading-to-each-other-out-loud",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 22
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 405,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Upvote the newest post in {{subreddit}} subreddit",
+ "instantiation_dict": {
+ "subreddit": "DIY"
+ },
+ "intent": "Upvote the newest post in DIY subreddit",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/DIY/119019/how-can-i-bring-an-hdmi-cable-from-my-pc-downstairs-to-my-tv",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 22
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 406,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Upvote the newest post in {{subreddit}} subreddit",
+ "instantiation_dict": {
+ "subreddit": "future technology"
+ },
+ "intent": "Upvote the newest post in future technology subreddit",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/Futurology/119517/openai-ceo-it-s-not-funny-that-i-m-afraid-of-the-ai-we-re",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 22
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 407,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Upvote the newest post in {{subreddit}} subreddit",
+ "instantiation_dict": {
+ "subreddit": "deeplearning"
+ },
+ "intent": "Upvote the newest post in deeplearning subreddit",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/deeplearning/124993/meta-s-llama-weights-leaked-on-torrent-and-the-best-thing",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 22
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 408,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Upvote the newest post in {{subreddit}} subreddit",
+ "instantiation_dict": {
+ "subreddit": "explain like im 5"
+ },
+ "intent": "Upvote the newest post in explain like im 5 subreddit",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/explainlikeimfive/39244/eli5-how-does-pinching-a-ribbon-and-sliding-your-finger",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 22
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 409,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists",
+ "geolocation": null,
+ "intent_template": "Reply to {{position_description}} in this post with \"{{content_description}}\"",
+ "instantiation_dict": {
+ "position_description": "the manager of the website",
+ "content_description": "thanks! I am a big fan of your website."
+ },
+ "intent": "Reply to the manager of the website in this post with \"thanks! I am a big fan of your website.\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/books/59421/-/comment/1235250",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "thanks! I am a big fan of your website."
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 23
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 410,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__/f/singularity/69404/this-is-how-chatgpt-sees-itself",
+ "geolocation": null,
+ "intent_template": "Reply to {{position_description}} in this post with \"{{content_description}}\"",
+ "instantiation_dict": {
+ "position_description": "the first reply",
+ "content_description": "don't panic"
+ },
+ "intent": "Reply to the first reply in this post with \"don't panic\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/singularity/69404/-/comment/1042264",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "don't panic"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 23
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 411,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Make the LICENSE of {{repo}} to MIT license.",
+ "instantiation_dict": {
+ "repo": "byteblaze/cloud-to-butt"
+ },
+ "intent": "Make the LICENSE of byteblaze/cloud-to-butt to MIT license.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/cloud-to-butt/-/blob/master/LICENSE.txt",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "MIT license"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 355
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 412,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Make the LICENSE of {{repo}} to Apache License",
+ "instantiation_dict": {
+ "repo": "byteblaze/accessible-html-content-patterns"
+ },
+ "intent": "Make the LICENSE of byteblaze/accessible-html-content-patterns to Apache License",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/accessible-html-content-patterns/-/blob/main/LICENSE",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Apache License",
+ "http://www.apache.org/licenses/LICENSE-2.0"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 355
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 413,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Make the LICENSE of {{repo}} to one that mandates all copies and derivative works to be under the same license",
+ "instantiation_dict": {
+ "repo": "byteblaze/a11y-syntax-highlighting"
+ },
+ "intent": "Make the LICENSE of byteblaze/a11y-syntax-highlighting to one that mandates all copies and derivative works to be under the same license",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/blob/main/LICENSE",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "GENERAL PUBLIC LICENSE"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 355
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 414,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Make the LICENSE of {{repo}} to MIT license.",
+ "instantiation_dict": {
+ "repo": "byteblaze/dotfiles"
+ },
+ "intent": "Make the LICENSE of byteblaze/dotfiles to MIT license.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/dotfiles/-/blob/main/LICENSE",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "MIT license",
+ "The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software."
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 355
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 415,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Go to the merge request on {{topic}} I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.",
+ "instantiation_dict": {
+ "topic": "verification functions"
+ },
+ "intent": "Go to the merge request on verification functions I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/a11y-webring.club/-/merge_requests/40",
+ "locator": "document.querySelector('[id=\"notes-list\"]').lastElementChild.querySelector('.timeline-discussion-body').outerText",
+ "required_contents": {
+ "exact_match": "@davepgreene"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 360
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 416,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Go to the merge request on {{topic}} I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.",
+ "instantiation_dict": {
+ "topic": "wcag"
+ },
+ "intent": "Go to the merge request on wcag I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1270",
+ "locator": "document.querySelector('[id=\"notes-list\"]').lastElementChild.querySelector('.timeline-discussion-body').outerText",
+ "required_contents": {
+ "exact_match": "Thank you"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 360
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 417,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Go to the merge request on {{topic}} I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.",
+ "instantiation_dict": {
+ "topic": "404 link"
+ },
+ "intent": "Go to the merge request on 404 link I have to review, find if the author of the merge request responded at the end, and reply \"Thank you\" if he did. Otherwise remind him with a simple @.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests/1485",
+ "locator": "document.querySelector('[id=\"notes-list\"]').lastElementChild.querySelector('.timeline-discussion-body').outerText",
+ "required_contents": {
+ "exact_match": "@Roshanjossey"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 360
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 418,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Set my gitlab status as {{status}}.",
+ "instantiation_dict": {
+ "status": "Busy"
+ },
+ "intent": "Set my gitlab status as Busy.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze",
+ "locator": "document.querySelector('.cover-status').lastChild.textContent",
+ "required_contents": {
+ "exact_match": "Busy"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 361
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 419,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Set my gitlab status as {{status}}.",
+ "instantiation_dict": {
+ "status": "Enjoying life"
+ },
+ "intent": "Set my gitlab status as Enjoying life.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze",
+ "locator": "document.querySelector('.cover-status').lastChild.textContent",
+ "required_contents": {
+ "exact_match": "Enjoying life"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 361
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 420,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Set my gitlab status as {{status}}.",
+ "instantiation_dict": {
+ "status": "Playing Badminton"
+ },
+ "intent": "Set my gitlab status as Playing Badminton.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze",
+ "locator": "document.querySelector('.cover-status').lastChild.textContent",
+ "required_contents": {
+ "exact_match": "Playing Badminton"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 361
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 421,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Set my gitlab status as {{status}}.",
+ "instantiation_dict": {
+ "status": "Resting due to leg injury"
+ },
+ "intent": "Set my gitlab status as Resting due to leg injury.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze",
+ "locator": "document.querySelector('.cover-status').lastChild.textContent",
+ "required_contents": {
+ "exact_match": "Resting due to leg injury"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 361
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 422,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Set my gitlab status as {{status}}.",
+ "instantiation_dict": {
+ "status": "Out of Office"
+ },
+ "intent": "Set my gitlab status as Out of Office.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze",
+ "locator": "document.querySelector('.cover-status').lastChild.textContent",
+ "required_contents": {
+ "exact_match": "Out of Office"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 361
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 423,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Mark all {{brand}} shirts on sale",
+ "instantiation_dict": {
+ "brand": "Hollister"
+ },
+ "intent": "Mark all Hollister shirts on sale",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/126/",
+ "locator": "document.querySelector('input[name=\"product[sale]\"]').value",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 237
+ },
+ {
+ "sites": [
+ "wikipedia",
+ "map"
+ ],
+ "task_id": 424,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Find the page of {{description}} on the map.",
+ "instantiation_dict": {
+ "description": "the place where Mr. Rogers was filmed"
+ },
+ "intent": "Find the page of the place where Mr. Rogers was filmed on the map.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sidebar_content\"]').outerText",
+ "required_contents": {
+ "must_include": [
+ "Pittsburgh"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 371
+ },
+ {
+ "sites": [
+ "wikipedia",
+ "map"
+ ],
+ "task_id": 425,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Find the page of {{description}} on the map.",
+ "instantiation_dict": {
+ "description": "the longest bridge between anchorages in the Western hemisphere"
+ },
+ "intent": "Find the page of the longest bridge between anchorages in the Western hemisphere on the map.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sidebar_content\"]').outerText",
+ "required_contents": {
+ "must_include": [
+ "Mackinac Bridge"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 371
+ },
+ {
+ "sites": [
+ "wikipedia",
+ "map"
+ ],
+ "task_id": 426,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Find the page of {{description}} on the map.",
+ "instantiation_dict": {
+ "description": "the place in Pennsylvania where a plane crashed during the September 11th attacks"
+ },
+ "intent": "Find the page of the place in Pennsylvania where a plane crashed during the September 11th attacks on the map.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sidebar_content\"]').outerText",
+ "required_contents": {
+ "must_include": [
+ "Somerset County"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 371
+ },
+ {
+ "sites": [
+ "wikipedia",
+ "map"
+ ],
+ "task_id": 427,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Find the page of {{description}} on the map.",
+ "instantiation_dict": {
+ "description": "the university that has most Turing Award winners"
+ },
+ "intent": "Find the page of the university that has most Turing Award winners on the map.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sidebar_content\"]').outerText",
+ "required_contents": {
+ "must_include": [
+ "Massachusetts Institute of Technology"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 371
+ },
+ {
+ "sites": [
+ "wikipedia",
+ "map"
+ ],
+ "task_id": 428,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Find the page of {{description}} on the map.",
+ "instantiation_dict": {
+ "description": "the undergrad college of the person who developed the Nash equilibrium"
+ },
+ "intent": "Find the page of the undergrad college of the person who developed the Nash equilibrium on the map.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sidebar_content\"]').outerText",
+ "required_contents": {
+ "must_include": [
+ "Carnegie Mellon University"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 371
+ },
+ {
+ "sites": [
+ "wikipedia",
+ "map"
+ ],
+ "task_id": 429,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Find the page of {{description}} on the map.",
+ "instantiation_dict": {
+ "description": "the colleges where The Chair was filmed in Pittsburgh"
+ },
+ "intent": "Find the page of the colleges where The Chair was filmed in Pittsburgh on the map.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sidebar_content\"]').outerText",
+ "required_contents": {
+ "must_include": [
+ "Chatham University"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 371
+ },
+ {
+ "sites": [
+ "wikipedia",
+ "map"
+ ],
+ "task_id": 430,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Find the page of {{description}} on the map.",
+ "instantiation_dict": {
+ "description": "the college(s) where The Chair was filmed in Pennsylvania other than the ones in Pittsburgh"
+ },
+ "intent": "Find the page of the college(s) where The Chair was filmed in Pennsylvania other than the ones in Pittsburgh on the map.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sidebar_content\"]').outerText",
+ "required_contents": {
+ "must_include": [
+ "Washington & Jefferson College"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 371
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 431,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/tall-pink-taper-candles-4-piece-orange-colored-tapered-candles-gradient-candles-10-6-inches-tall-tie-dye-candle-set-large-dripless-long-burning-candlesticks-two-color-taper-candles-candlesticks.html |AND| __SHOPPING__/spaas-white-taper-candles-4-pack-10-inch-tall-candles-scent-free-premium-wax-candle-sticks-8-hour-long-burning-white-candlesticks-for-home-decoration-wedding-holiday-and-parties.html |AND| __SHOPPING__/white-starfish-wall-candle-sconces-set-of-2-beach-decor-ocean-themed-wall-mount-candleholders-nautical-style-beach-bathroom-decor-coastal-farmhouse-seashell-candle-holders.html",
+ "geolocation": null,
+ "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart",
+ "instantiation_dict": {},
+ "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/checkout/cart",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "SPAAS White Taper Candles - 4 Pack |OR| 10 Inch Tall Candles, Scent-Free Premium Wax Candle Sticks |OR| 8 Hour Long Burning White Candlesticks for Home Decoration, Wedding, Holiday and Parties"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 145
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 432,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/ciclon-energy-drink-regular-24-cans-8-3oz.html |AND| __SHOPPING__/v8-energy-healthy-energy-drink-steady-energy-from-black-and-green-tea-pomegranate-blueberry-8-ounce-can-pack-of-24.html",
+ "geolocation": null,
+ "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart",
+ "instantiation_dict": {},
+ "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/checkout/cart",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "V8 +Energy, Healthy Energy Drink, Steady Energy from Black and Green Tea, Pomegranate Blueberry, 8 Ounce Can ,Pack of 24"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 145
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 433,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/tazrigo-5pcs-white-dental-resin-brush-pens-dental-shaping-silicone-tooth-tool.html |AND| __SHOPPING__/stylus-pens-for-touch-screens-2-pcs-universal-stylus-2-in-1-2022-updated-touch-screen-pens-for-all-touch-screens-cell-phones-tablets-laptops-with-6-replacement-tips-4-discstips-2-fiber-tips.html",
+ "geolocation": null,
+ "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart",
+ "instantiation_dict": {},
+ "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/checkout/cart",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Tazrigo 5pcs White Dental Resin Brush Pens Dental Shaping Silicone Tooth Tool"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 145
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 434,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/3-pairs-ruffle-socks-lace-ankle-socks-for-girls-frilly-socks-women-decorative.html |AND| __SHOPPING__/viviki-women-glitter-socks-ultrathin-transparent-tulle-lace-socks-no-show-ankle-crew-socks-3-pack.html",
+ "geolocation": null,
+ "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart",
+ "instantiation_dict": {},
+ "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/checkout/cart",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "VIVIKI Women Glitter Socks Ultrathin Transparent Tulle Lace Socks - No Show Ankle Crew Socks 3 Pack"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 145
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 435,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/35-ft-hdmi-cable-gearit-pro-series-hdmi-cable-35-feet-high-speed-ethernet-4k-resolution-3d-video-and-arc-audio-return-channel-hdmi-cable-white.html |AND| __SHOPPING__/dp-to-hdmi-cable-6ft-2-pack-fosmon-gold-plated-displayport-to-hdmi-cable-1080p-full-hd-for-pcs-to-hdtv-monitor-projector-with-hdmi-port.html",
+ "geolocation": null,
+ "intent_template": "Add the product with the lowest per unit price from my open tabs to the shopping cart",
+ "instantiation_dict": {},
+ "intent": "Add the product with the lowest per unit price from my open tabs to the shopping cart",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/checkout/cart",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "DP to HDMI Cable 6FT (2 Pack), Fosmon Gold Plated Displayport to HDMI Cable 1080p Full HD for PCs to HDTV, Monitor, Projector with HDMI Port"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 145
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 436,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?",
+ "instantiation_dict": {
+ "product": "a mattress foundation",
+ "time": "around Feb or March 2023"
+ },
+ "intent": "I previously ordered a mattress foundation around Feb or March 2023 and later cancelled. Can you reorder it for me?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "func:shopping_get_latest_order_url()",
+ "locator": "document.querySelector(\".order-details-items.ordered\").outerText",
+ "required_contents": {
+ "must_include": [
+ "B07DFJ5XKH"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 156
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 437,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?",
+ "instantiation_dict": {
+ "product": "a table lamp",
+ "time": "in May 2023"
+ },
+ "intent": "I previously ordered a table lamp in May 2023 and later cancelled. Can you reorder it for me?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "func:shopping_get_latest_order_url()",
+ "locator": "document.querySelector(\".order-details-items.ordered\").outerText",
+ "required_contents": {
+ "must_include": [
+ "B072XS3F6W"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 156
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 438,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?",
+ "instantiation_dict": {
+ "product": "a TV stand",
+ "time": "sometime around sep 2022"
+ },
+ "intent": "I previously ordered a TV stand sometime around sep 2022 and later cancelled. Can you reorder it for me?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "func:shopping_get_latest_order_url()",
+ "locator": "document.querySelector(\".order-details-items.ordered\").outerText",
+ "required_contents": {
+ "must_include": [
+ "B08PVHRRB7"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 156
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 439,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?",
+ "instantiation_dict": {
+ "product": "a cat t-shirt",
+ "time": "during 2022"
+ },
+ "intent": "I previously ordered a cat t-shirt during 2022 and later cancelled. Can you reorder it for me?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "func:shopping_get_latest_order_url()",
+ "locator": "document.querySelector(\".order-details-items.ordered\").outerText",
+ "required_contents": {
+ "must_include": [
+ "B0844BWS76"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 156
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 440,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I previously ordered {{product}} {{time}} and later cancelled. Can you reorder it for me?",
+ "instantiation_dict": {
+ "product": "a make up removal kit",
+ "time": "during summer 2022"
+ },
+ "intent": "I previously ordered a make up removal kit during summer 2022 and later cancelled. Can you reorder it for me?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "func:shopping_get_latest_order_url()",
+ "locator": "document.querySelector(\".order-details-items.ordered\").outerText",
+ "required_contents": {
+ "must_include": [
+ "B0738JQG6Q"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 156
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 441,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/byteblaze/gimmiethat.space",
+ "geolocation": null,
+ "intent_template": "Update the project site's title to \"{{title}}\"",
+ "instantiation_dict": {
+ "title": "GIVE ME SPACE"
+ },
+ "intent": "Update the project site's title to \"GIVE ME SPACE\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "GIVE ME SPACE"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 308
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 442,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/byteblaze/gimmiethat.space",
+ "geolocation": null,
+ "intent_template": "Update the project site's title to \"{{title}}\"",
+ "instantiation_dict": {
+ "title": "Welcome to my site"
+ },
+ "intent": "Update the project site's title to \"Welcome to my site\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Welcome to my site"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 308
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 443,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/byteblaze/gimmiethat.space",
+ "geolocation": null,
+ "intent_template": "Update the project site's title to \"{{title}}\"",
+ "instantiation_dict": {
+ "title": "Not an interesting site"
+ },
+ "intent": "Update the project site's title to \"Not an interesting site\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Not an interesting site"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 308
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 444,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/byteblaze/gimmiethat.space",
+ "geolocation": null,
+ "intent_template": "Update the project site's title to \"{{title}}\"",
+ "instantiation_dict": {
+ "title": "Title Wanted"
+ },
+ "intent": "Update the project site's title to \"Title Wanted\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Title Wanted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 308
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 445,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/byteblaze/gimmiethat.space",
+ "geolocation": null,
+ "intent_template": "Update the project site's title to \"{{title}}\"",
+ "instantiation_dict": {
+ "title": "Hello"
+ },
+ "intent": "Update the project site's title to \"Hello\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/index.html",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Hello"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 308
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 446,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Assign the issue regarding {{issue}} in {{repo}} to {{account}}.",
+ "instantiation_dict": {
+ "repo": "a11yproject",
+ "issue": 404,
+ "account": "Roshanjossey"
+ },
+ "intent": "Assign the issue regarding 404 in a11yproject to Roshanjossey.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=Roshanjossey",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "404s, bad host, timeouts, bad urls for URLs linked from website"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 999
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 447,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Assign the issue regarding {{issue}} in {{repo}} to {{account}}.",
+ "instantiation_dict": {
+ "repo": "a11y-webring.club",
+ "issue": "linking to an accessibility statement",
+ "account": "Rohan"
+ },
+ "intent": "Assign the issue regarding linking to an accessibility statement in a11y-webring.club to Rohan.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=Seirdy",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "linking to an accessibility statement"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 999
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 448,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "set the homepage URL on my GitLab profile to {{url}}",
+ "instantiation_dict": {
+ "url": "https://egg.tart.com"
+ },
+ "intent": "set the homepage URL on my GitLab profile to https://egg.tart.com",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze",
+ "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText",
+ "required_contents": {
+ "exact_match": "egg.tart.com"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 331
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 449,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "set the homepage URL on my GitLab profile to {{url}}",
+ "instantiation_dict": {
+ "url": "https://helloworld.xyz"
+ },
+ "intent": "set the homepage URL on my GitLab profile to https://helloworld.xyz",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze",
+ "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText",
+ "required_contents": {
+ "exact_match": "helloworld.xyz"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 331
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 450,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "set the homepage URL on my GitLab profile to {{url}}",
+ "instantiation_dict": {
+ "url": "a11yproject.contributor.me"
+ },
+ "intent": "set the homepage URL on my GitLab profile to a11yproject.contributor.me",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze",
+ "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText",
+ "required_contents": {
+ "exact_match": "a11yproject.contributor.me"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 331
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 451,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "set the homepage URL on my GitLab profile to {{url}}",
+ "instantiation_dict": {
+ "url": "www.byteblaze.com"
+ },
+ "intent": "set the homepage URL on my GitLab profile to www.byteblaze.com",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze",
+ "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText",
+ "required_contents": {
+ "exact_match": "www.byteblaze.com"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 331
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 452,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "set the homepage URL on my GitLab profile to {{url}}",
+ "instantiation_dict": {
+ "url": "byteblaze.github.io"
+ },
+ "intent": "set the homepage URL on my GitLab profile to byteblaze.github.io",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze",
+ "locator": "document.querySelector('.profile-header [itemprop=\"url\"]').outerText",
+ "required_contents": {
+ "exact_match": "byteblaze.github.io"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 331
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 453,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Disable {{product}} from the site, they are facing some quality issues.",
+ "instantiation_dict": {
+ "product": "Teton pullover hoodie"
+ },
+ "intent": "Disable Teton pullover hoodie from the site, they are facing some quality issues.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/78/",
+ "locator": "document.querySelector('[name=\"product[status]\"]').value",
+ "required_contents": {
+ "exact_match": "2"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 242
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 454,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Disable {{product}} from the site, they are facing some quality issues.",
+ "instantiation_dict": {
+ "product": "Ryker Tee Crew Neck"
+ },
+ "intent": "Disable Ryker Tee Crew Neck from the site, they are facing some quality issues.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/478/",
+ "locator": "document.querySelector('[name=\"product[status]\"]').value",
+ "required_contents": {
+ "exact_match": "2"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 242
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 455,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Disable {{product}} from the site, they are facing some quality issues.",
+ "instantiation_dict": {
+ "product": "Helios Endurance Tank"
+ },
+ "intent": "Disable Helios Endurance Tank from the site, they are facing some quality issues.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/676/",
+ "locator": "document.querySelector('[name=\"product[status]\"]').value",
+ "required_contents": {
+ "exact_match": "2"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 242
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 456,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Disable {{product}} from the site, they are facing some quality issues.",
+ "instantiation_dict": {
+ "product": "Cora Pant"
+ },
+ "intent": "Disable Cora Pant from the site, they are facing some quality issues.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1840/",
+ "locator": "document.querySelector('[name=\"product[status]\"]').value",
+ "required_contents": {
+ "exact_match": "2"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 242
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 457,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Disable {{product}} from the site, they are facing some quality issues.",
+ "instantiation_dict": {
+ "product": "Karmen yoga pants"
+ },
+ "intent": "Disable Karmen yoga pants from the site, they are facing some quality issues.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1819/",
+ "locator": "document.querySelector('[name=\"product[status]\"]').value",
+ "required_contents": {
+ "exact_match": "2"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 242
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 458,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1481/",
+ "geolocation": null,
+ "intent_template": "{{action}} the price of this product by {{amount}}",
+ "instantiation_dict": {
+ "amount": "$5",
+ "action": "Reduce"
+ },
+ "intent": "Reduce the price of this product by $5",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1481/",
+ "locator": "document.querySelector('[name=\"product[price]\"]').value",
+ "required_contents": {
+ "exact_match": "27.00"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 247
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 459,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/237/",
+ "geolocation": null,
+ "intent_template": "{{action}} the price of this product by {{amount}}",
+ "instantiation_dict": {
+ "amount": "10%",
+ "action": "Reduce"
+ },
+ "intent": "Reduce the price of this product by 10%",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/237/",
+ "locator": "document.querySelector('[name=\"product[price]\"]').value",
+ "required_contents": {
+ "exact_match": "62.10"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 247
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 460,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/418/",
+ "geolocation": null,
+ "intent_template": "{{action}} the price of this product by {{amount}}",
+ "instantiation_dict": {
+ "amount": "15%",
+ "action": "Reduce"
+ },
+ "intent": "Reduce the price of this product by 15%",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/418/",
+ "locator": "document.querySelector('[name=\"product[price]\"]').value",
+ "required_contents": {
+ "exact_match": "38.25"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 247
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 461,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/721/",
+ "geolocation": null,
+ "intent_template": "{{action}} the price of this product by {{amount}}",
+ "instantiation_dict": {
+ "amount": "$11.5",
+ "action": "Increase"
+ },
+ "intent": "Increase the price of this product by $11.5",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/721/",
+ "locator": "document.querySelector('[name=\"product[price]\"]').value",
+ "required_contents": {
+ "exact_match": "29.50"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 247
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 462,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/710/",
+ "geolocation": null,
+ "intent_template": "{{action}} the price of this product by {{amount}}",
+ "instantiation_dict": {
+ "amount": "10%",
+ "action": "Increase"
+ },
+ "intent": "Increase the price of this product by 10%",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/710/",
+ "locator": "document.querySelector('[name=\"product[price]\"]').value",
+ "required_contents": {
+ "exact_match": "19.80"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 247
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 463,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/996/",
+ "geolocation": null,
+ "intent_template": "{{action}} the price of this product by {{amount}}",
+ "instantiation_dict": {
+ "amount": "15%",
+ "action": "Increase"
+ },
+ "intent": "Increase the price of this product by 15%",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/996/",
+ "locator": "document.querySelector('[name=\"product[price]\"]').value",
+ "required_contents": {
+ "exact_match": "36.80"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 247
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 464,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Update the product description of {{product}} to highlight the real user positive reviews by quoting the comments",
+ "instantiation_dict": {
+ "product": "Antonia Racer Tank"
+ },
+ "intent": "Update the product description of Antonia Racer Tank to highlight the real user positive reviews by quoting the comments",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/../antonia-racer-tank.html",
+ "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')",
+ "required_contents": {
+ "must_include": [
+ "This is in regular rotation at the gym",
+ "Its colorful and looks kinda cute under my exercise tanks",
+ "it's very stylish for yoga or something else low impact"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 251
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 465,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Add {{product}} to my wish list",
+ "instantiation_dict": {
+ "product": "Tide PODS Spring Meadow Scent HE Turbo Laundry Detergent Pacs, 81 Count"
+ },
+ "intent": "Add Tide PODS Spring Meadow Scent HE Turbo Laundry Detergent Pacs, 81 Count to my wish list",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/wishlist/",
+ "locator": "document.querySelector('.products-grid.wishlist').outerText",
+ "required_contents": {
+ "must_include": [
+ "Tide PODS Spring Meadow Scent HE Turbo Laundry Detergent Pacs, 81 Count"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 186
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 466,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Add {{product}} to my wish list",
+ "instantiation_dict": {
+ "product": "2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts"
+ },
+ "intent": "Add 2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts to my wish list",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/wishlist/",
+ "locator": "document.querySelector('.products-grid.wishlist').outerText",
+ "required_contents": {
+ "must_include": [
+ "2 Hawaiian Bamboo Orchid Roots #zc50 - by Discount Hawaiian Gifts"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 186
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 467,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Add {{product}} to my wish list",
+ "instantiation_dict": {
+ "product": "HONGJ Hawaiian Beach Outfits Set for Mens, Summer Tropical Tree Printed Relaxed-fit Hawaii Shirts Shorts 2 Piece Suits"
+ },
+ "intent": "Add HONGJ Hawaiian Beach Outfits Set for Mens, Summer Tropical Tree Printed Relaxed-fit Hawaii Shirts Shorts 2 Piece Suits to my wish list",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/wishlist/",
+ "locator": "document.querySelector('.products-grid.wishlist').outerText",
+ "required_contents": {
+ "must_include": [
+ "HONGJ Hawaiian Beach Outfits Set for Mens, Summer Tropical Tree Printed Relaxed-fit Hawaii Shirts Shorts 2 Piece Suits"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 186
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 468,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Add {{product}} to my wish list",
+ "instantiation_dict": {
+ "product": "DkRgVNY Lace Spcling Lingerie Womens Sexy Hollow Out Underwear Bodysuit One Piece Snap Crotch Clubwear Teddy Bodysuit"
+ },
+ "intent": "Add DkRgVNY Lace Spcling Lingerie Womens Sexy Hollow Out Underwear Bodysuit One Piece Snap Crotch Clubwear Teddy Bodysuit to my wish list",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/wishlist/",
+ "locator": "document.querySelector('.products-grid.wishlist').outerText",
+ "required_contents": {
+ "must_include": [
+ "DkRgVNY Lace Spcling Lingerie Womens Sexy Hollow Out Underwear Bodysuit One Piece Snap Crotch Clubwear Teddy Bodysuit"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 186
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 469,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Add {{product}} to my wish list",
+ "instantiation_dict": {
+ "product": "Light Blue Simple Summer New Low Heels Slippers for Women Fashion Chunky Heels Pointed Toe Wine Glasses Sandals Comfortable Walking Shoes Ladies All-Match Sexy Party Shoes"
+ },
+ "intent": "Add Light Blue Simple Summer New Low Heels Slippers for Women Fashion Chunky Heels Pointed Toe Wine Glasses Sandals Comfortable Walking Shoes Ladies All-Match Sexy Party Shoes to my wish list",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/wishlist/",
+ "locator": "document.querySelector('.products-grid.wishlist').outerText",
+ "required_contents": {
+ "must_include": [
+ "Light Blue Simple Summer New Low Heels Slippers for Women Fashion Chunky Heels Pointed Toe Wine Glasses Sandals Comfortable Walking Shoes Ladies All-Match Sexy Party Shoes"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 186
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 470,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Cancel order {{id}}",
+ "instantiation_dict": {
+ "id": "302"
+ },
+ "intent": "Cancel order 302",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/302/",
+ "locator": "document.querySelector(\"#order_status\").outerText",
+ "required_contents": {
+ "exact_match": "Canceled"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 257
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 471,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Cancel order {{id}}",
+ "instantiation_dict": {
+ "id": "307"
+ },
+ "intent": "Cancel order 307",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/307/",
+ "locator": "document.querySelector(\"#order_status\").outerText",
+ "required_contents": {
+ "exact_match": "Canceled"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 257
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 472,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Cancel order {{id}}",
+ "instantiation_dict": {
+ "id": "299"
+ },
+ "intent": "Cancel order 299",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/299/",
+ "locator": "document.querySelector(\"#order_status\").outerText",
+ "required_contents": {
+ "exact_match": "Canceled"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 257
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 473,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Cancel order {{id}}",
+ "instantiation_dict": {
+ "id": "301"
+ },
+ "intent": "Cancel order 301",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/301/",
+ "locator": "document.querySelector(\"#order_status\").outerText",
+ "required_contents": {
+ "exact_match": "Canceled"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 257
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 474,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Cancel order {{id}}",
+ "instantiation_dict": {
+ "id": "305"
+ },
+ "intent": "Cancel order 305",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/305/",
+ "locator": "document.querySelector(\"#order_status\").outerText",
+ "required_contents": {
+ "exact_match": "Canceled"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 257
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 475,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Set up a new, empty repository with the name {{project_name}}?",
+ "instantiation_dict": {
+ "project_name": "chatgpt_plugin"
+ },
+ "intent": "Set up a new, empty repository with the name chatgpt_plugin?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/chatgpt_plugin",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "chatgpt_plugin"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 292
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 476,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Set up a new, empty repository with the name {{project_name}}?",
+ "instantiation_dict": {
+ "project_name": "awesome_llm_reading"
+ },
+ "intent": "Set up a new, empty repository with the name awesome_llm_reading?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/awesome_llm_reading",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "awesome_llm_reading"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 292
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 477,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Set up a new, empty repository with the name {{project_name}}?",
+ "instantiation_dict": {
+ "project_name": "awesome_program_aided_reasoning"
+ },
+ "intent": "Set up a new, empty repository with the name awesome_program_aided_reasoning?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/awesome_program_aided_reasoning",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "awesome_program_aided_reasoning"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 292
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 478,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Set up a new, empty repository with the name {{project_name}}?",
+ "instantiation_dict": {
+ "project_name": "webagent"
+ },
+ "intent": "Set up a new, empty repository with the name webagent?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/webagent",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "webagent"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 292
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 479,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Set up a new, empty repository with the name {{project_name}}?",
+ "instantiation_dict": {
+ "project_name": "awesome_webagent"
+ },
+ "intent": "Set up a new, empty repository with the name awesome_webagent?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/awesome_webagent",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "awesome_webagent"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 292
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 480,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}}",
+ "instantiation_dict": {
+ "collaborator_account_list": "yjlou",
+ "repo": "solarized-prism-theme"
+ },
+ "intent": "Invite yjlou as collaborator to solarized-prism-theme",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/solarized-prism-theme/-/project_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "yjlou"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 293
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 481,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.",
+ "instantiation_dict": {
+ "name": "Abishek"
+ },
+ "intent": "Abishek wants to check my dotfile configurations. Please invite him to the repo as a guest.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/dotfiles/-/project_members",
+ "locator": "func:gitlab_get_project_memeber_role(__page__, 'abisubramanya27')",
+ "required_contents": {
+ "must_include": [
+ "Guest"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 294
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 482,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.",
+ "instantiation_dict": {
+ "name": "yjlou"
+ },
+ "intent": "yjlou wants to check my dotfile configurations. Please invite him to the repo as a guest.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/dotfiles/-/project_members",
+ "locator": "func:gitlab_get_project_memeber_role(__page__, 'yjlou')",
+ "required_contents": {
+ "must_include": [
+ "Guest"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 294
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 483,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.",
+ "instantiation_dict": {
+ "name": "Koushik"
+ },
+ "intent": "Koushik wants to check my dotfile configurations. Please invite him to the repo as a guest.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/dotfiles/-/project_members",
+ "locator": "func:gitlab_get_project_memeber_role(__page__, 'koush')",
+ "required_contents": {
+ "must_include": [
+ "Guest"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 294
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 484,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.",
+ "instantiation_dict": {
+ "name": "Jakub Klinkovsk\u00fd"
+ },
+ "intent": "Jakub Klinkovsk\u00fd wants to check my dotfile configurations. Please invite him to the repo as a guest.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/dotfiles/-/project_members",
+ "locator": "func:gitlab_get_project_memeber_role(__page__, 'lahwaacz')",
+ "required_contents": {
+ "must_include": [
+ "Guest"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 294
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 485,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "{{name}} wants to check my dotfile configurations. Please invite him to the repo as a guest.",
+ "instantiation_dict": {
+ "name": "Vinta"
+ },
+ "intent": "Vinta wants to check my dotfile configurations. Please invite him to the repo as a guest.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/dotfiles/-/project_members",
+ "locator": "func:gitlab_get_project_memeber_role(__page__, 'vinta')",
+ "required_contents": {
+ "must_include": [
+ "Guest"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 294
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 486,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".",
+ "instantiation_dict": {
+ "old-heading": "404 Not Found",
+ "heading": "Bruh bro you clicked the wrong page"
+ },
+ "intent": "Change the page title of \"404 Not Found\" page on my site to \"Bruh bro you clicked the wrong page\".",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/cms/page/edit/page_id/1/",
+ "locator": "document.querySelector('input[name=\"title\"]').value",
+ "required_contents": {
+ "exact_match": "Bruh bro you clicked the wrong page"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 275
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 487,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".",
+ "instantiation_dict": {
+ "old-heading": "Enable Cookies",
+ "heading": "Cookie monster coming to your place"
+ },
+ "intent": "Change the page title of \"Enable Cookies\" page on my site to \"Cookie monster coming to your place\".",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/cms/page/edit/page_id/3/",
+ "locator": "document.querySelector('input[name=\"title\"]').value",
+ "required_contents": {
+ "exact_match": "Cookie monster coming to your place"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 275
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 488,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".",
+ "instantiation_dict": {
+ "old-heading": "Home Page",
+ "heading": "This is the home page!! Leave here!!"
+ },
+ "intent": "Change the page title of \"Home Page\" page on my site to \"This is the home page!! Leave here!!\".",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/cms/page/edit/page_id/2/",
+ "locator": "document.querySelector('input[name=\"title\"]').value",
+ "required_contents": {
+ "exact_match": "This is the home page!! Leave here!!"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 275
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 489,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".",
+ "instantiation_dict": {
+ "old-heading": "Privacy Policy",
+ "heading": "No privacy policy is needed in this dystopian world"
+ },
+ "intent": "Change the page title of \"Privacy Policy\" page on my site to \"No privacy policy is needed in this dystopian world\".",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/cms/page/edit/page_id/4/",
+ "locator": "document.querySelector('input[name=\"title\"]').value",
+ "required_contents": {
+ "exact_match": "No privacy policy is needed in this dystopian world"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 275
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 490,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Change the page title of \"{{old-heading}}\" page on my site to \"{{heading}}\".",
+ "instantiation_dict": {
+ "old-heading": "About us",
+ "heading": "Secret"
+ },
+ "intent": "Change the page title of \"About us\" page on my site to \"Secret\".",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/cms/page/edit/page_id/5/",
+ "locator": "document.querySelector('input[name=\"title\"]').value",
+ "required_contents": {
+ "exact_match": "Secret"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 275
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 491,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"",
+ "instantiation_dict": {
+ "name": "Sarah Miller",
+ "message": "the order is ready to be shipped soon!"
+ },
+ "intent": "Notify Sarah Miller in their most recent pending order with message \"the order is ready to be shipped soon!\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "The system message says that \"We cannot add order history.\". Hence the message cannot be sent."
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": ""
+ },
+ "intent_template_id": 280
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 492,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"",
+ "instantiation_dict": {
+ "name": "Jane Doe",
+ "message": "sorry we are out of stock, please reorder"
+ },
+ "intent": "Notify Jane Doe in their most recent pending order with message \"sorry we are out of stock, please reorder\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/302/",
+ "locator": "document.querySelector(\"#order_history_block\").querySelector(\".note-list\").firstElementChild.querySelector(\".note-list-comment\").outerText",
+ "required_contents": {
+ "exact_match": "sorry we are out of stock, please reorder"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 280
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 493,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"",
+ "instantiation_dict": {
+ "name": "Grace Nguyen",
+ "message": "sorry we are bankrupt, please contact our customer service for refund"
+ },
+ "intent": "Notify Grace Nguyen in their most recent pending order with message \"sorry we are bankrupt, please contact our customer service for refund\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/307/",
+ "locator": "document.querySelector(\"#order_history_block\").querySelector(\".note-list\").firstElementChild.querySelector(\".note-list-comment\").outerText",
+ "required_contents": {
+ "exact_match": "sorry we are bankrupt, please contact our customer service for refund"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 280
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 494,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"",
+ "instantiation_dict": {
+ "name": "Alex Thomas",
+ "message": "Yo, your order will be shipped soon!"
+ },
+ "intent": "Notify Alex Thomas in their most recent pending order with message \"Yo, your order will be shipped soon!\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/304/",
+ "locator": "document.querySelector(\"#order_history_block\").querySelector(\".note-list\").firstElementChild.querySelector(\".note-list-comment\").outerText",
+ "required_contents": {
+ "exact_match": "Yo, your order will be shipped soon!"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 280
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 495,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Notify {{name}} in their most recent pending order with message \"{{message}}\"",
+ "instantiation_dict": {
+ "name": "Lily Potter",
+ "message": "Thanks, your order is ready to be shipped!"
+ },
+ "intent": "Notify Lily Potter in their most recent pending order with message \"Thanks, your order is ready to be shipped!\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/303/",
+ "locator": "document.querySelector(\"#order_history_block\").querySelector(\".note-list\").firstElementChild.querySelector(\".note-list-comment\").outerText",
+ "required_contents": {
+ "exact_match": "Thanks, your order is ready to be shipped!"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 280
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 496,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}",
+ "instantiation_dict": {
+ "tracking": "8974568499",
+ "order": "299",
+ "service": "Federal Express"
+ },
+ "intent": "Update order #299 with the Federal Express tracking number 8974568499",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/299/active_tab/order_shipments/",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Tracking number 8974568499 for Federal Express assigned"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 284
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 497,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}",
+ "instantiation_dict": {
+ "tracking": "24353446464",
+ "order": "307",
+ "service": "DHL"
+ },
+ "intent": "Update order #307 with the DHL tracking number 24353446464",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/307/active_tab/order_shipments/",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Tracking number 24353446464 for DHL assigned"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 284
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 498,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}",
+ "instantiation_dict": {
+ "tracking": "55591023930",
+ "order": "306",
+ "service": "UPS"
+ },
+ "intent": "Update order #306 with the UPS tracking number 55591023930",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/306/active_tab/order_shipments/",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Tracking number 55591023930 for United Parcel Service assigned"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 284
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 499,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}",
+ "instantiation_dict": {
+ "tracking": "13849373987",
+ "order": "304",
+ "service": "USPS"
+ },
+ "intent": "Update order #304 with the USPS tracking number 13849373987",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/304/active_tab/order_shipments/",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Tracking number 13849373987 for United States Postal Service assigned"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 284
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 500,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Update order #{{order}} with the {{service}} tracking number {{tracking}}",
+ "instantiation_dict": {
+ "tracking": "239028439840",
+ "order": "301",
+ "service": "DHL"
+ },
+ "intent": "Update order #301 with the DHL tracking number 239028439840",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/sales/order/commentsHistory/order_id/301/active_tab/order_shipments/",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Tracking number 239028439840 for DHL assigned"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 284
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 501,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Make all {{product}} as out of stock",
+ "instantiation_dict": {
+ "product": "Taurus Elements Shell"
+ },
+ "intent": "Make all Taurus Elements Shell as out of stock",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/350/",
+ "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value",
+ "required_contents": {
+ "exact_match": "0"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 287
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 502,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Make all {{product}} as out of stock",
+ "instantiation_dict": {
+ "product": "Gobi HeatTec Tee"
+ },
+ "intent": "Make all Gobi HeatTec Tee as out of stock",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/446/",
+ "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value",
+ "required_contents": {
+ "exact_match": "0"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 287
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 503,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Make all {{product}} as out of stock",
+ "instantiation_dict": {
+ "product": "rocco gym tank"
+ },
+ "intent": "Make all rocco gym tank as out of stock",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/682/",
+ "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value",
+ "required_contents": {
+ "exact_match": "0"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 287
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 504,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Make all {{product}} as out of stock",
+ "instantiation_dict": {
+ "product": "Selene yoga hoodie"
+ },
+ "intent": "Make all Selene yoga hoodie as out of stock",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1108/",
+ "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value",
+ "required_contents": {
+ "exact_match": "0"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 287
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 505,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Make all {{product}} as out of stock",
+ "instantiation_dict": {
+ "product": "Aeno capri"
+ },
+ "intent": "Make all Aeno capri as out of stock",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1861/",
+ "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value",
+ "required_contents": {
+ "exact_match": "0"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 287
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 506,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.",
+ "instantiation_dict": {
+ "product_category": "meat substitute",
+ "dollar_value": "between 100 and 200"
+ },
+ "intent": "Buy the highest rated product from the meat substitute category within a budget between 100 and 200.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "func:shopping_get_latest_order_url()",
+ "locator": "document.querySelector(\".order-details-items.ordered\").outerText",
+ "required_contents": {
+ "must_include": [
+ "B01CTR3DLE"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 172
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 507,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.",
+ "instantiation_dict": {
+ "product_category": "Ceiling light",
+ "dollar_value": "above 1000"
+ },
+ "intent": "Buy the highest rated product from the Ceiling light category within a budget above 1000.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "func:shopping_get_latest_order_url()",
+ "locator": "document.querySelector(\".order-details-items.ordered\").outerText",
+ "required_contents": {
+ "must_include": [
+ "B07BVL3P1V"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 172
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 508,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Buy the highest rated product from the {{product_category}} category within a budget {{dollar_value}}.",
+ "instantiation_dict": {
+ "product_category": "NS switch pouch",
+ "dollar_value": "under 60"
+ },
+ "intent": "Buy the highest rated product from the NS switch pouch category within a budget under 60.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "func:shopping_get_latest_order_url()",
+ "locator": "document.querySelector(\".order-details-items.ordered\").outerText",
+ "required_contents": {
+ "must_include": [
+ "B07116LGP6"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 172
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 509,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Buy the best rating product from \"{{category}}\" category with at least 5 reviews and the product is least expensive",
+ "instantiation_dict": {
+ "category": "Men's shoe"
+ },
+ "intent": "Buy the best rating product from \"Men's shoe\" category with at least 5 reviews and the product is least expensive",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "func:shopping_get_latest_order_url()",
+ "locator": "document.querySelector(\".order-details-items.ordered\").outerText",
+ "required_contents": {
+ "must_include": [
+ "B01J4MM3KO"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 216
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 510,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Buy the best rating product from \"{{category}}\" category with at least 5 reviews and the product is least expensive",
+ "instantiation_dict": {
+ "category": "Home Audio Speaker"
+ },
+ "intent": "Buy the best rating product from \"Home Audio Speaker\" category with at least 5 reviews and the product is least expensive",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "func:shopping_get_latest_order_url()",
+ "locator": "document.querySelector(\".order-details-items.ordered\").outerText",
+ "required_contents": {
+ "must_include": [
+ "B002R5ABIW"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 216
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 511,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Add a {{product}} to my wish list.",
+ "instantiation_dict": {
+ "product": "laundry detergent"
+ },
+ "intent": "Add a laundry detergent to my wish list.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/wishlist/",
+ "locator": "document.querySelector('.products-grid.wishlist').outerText",
+ "required_contents": {
+ "must_include": [
+ "laundry",
+ "detergent"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 189
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 512,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Add a {{product}} to my wish list.",
+ "instantiation_dict": {
+ "product": "toothpaste"
+ },
+ "intent": "Add a toothpaste to my wish list.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/wishlist/",
+ "locator": "document.querySelector('.products-grid.wishlist').outerText",
+ "required_contents": {
+ "must_include": [
+ "toothpaste"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 189
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 513,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Add a {{product}} to my wish list.",
+ "instantiation_dict": {
+ "product": "chair"
+ },
+ "intent": "Add a chair to my wish list.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/wishlist/",
+ "locator": "document.querySelector('.products-grid.wishlist').outerText",
+ "required_contents": {
+ "must_include": [
+ "chair"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 189
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 514,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Add a {{product}} to my wish list.",
+ "instantiation_dict": {
+ "product": "white desk"
+ },
+ "intent": "Add a white desk to my wish list.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/wishlist/",
+ "locator": "document.querySelector('.products-grid.wishlist').outerText",
+ "required_contents": {
+ "must_include": [
+ "white",
+ "desk"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 189
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 515,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Add a {{product}} to my wish list.",
+ "instantiation_dict": {
+ "product": "white computer desk"
+ },
+ "intent": "Add a white computer desk to my wish list.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/wishlist/",
+ "locator": "document.querySelector('.products-grid.wishlist').outerText",
+ "required_contents": {
+ "must_include": [
+ "white",
+ "computer",
+ "desk"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 189
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 516,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/elmwood-inn-fine-teas-orange-vanilla-caffeine-free-fruit-infusion-16-ounce-pouch.html",
+ "geolocation": null,
+ "intent_template": "Add this product to my wishlist",
+ "instantiation_dict": {},
+ "intent": "Add this product to my wishlist",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/wishlist/",
+ "locator": "document.querySelector('.products-grid.wishlist').outerText",
+ "required_contents": {
+ "must_include": [
+ "Elmwood Inn Fine Teas, Orange Vanilla Caffeine-free Fruit Infusion, 16-Ounce Pouch"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 196
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 517,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/skinit-decal-gaming-skin-compatible-with-xbox-one-s-console-and-controller-bundle-officially-licensed-nfl-baltimore-ravens-design.html",
+ "geolocation": null,
+ "intent_template": "Add this product to my wishlist",
+ "instantiation_dict": {},
+ "intent": "Add this product to my wishlist",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/wishlist/",
+ "locator": "document.querySelector('.products-grid.wishlist').outerText",
+ "required_contents": {
+ "must_include": [
+ "Skinit Decal Gaming Skin Compatible with Xbox One S Console and Controller Bundle - Officially Licensed NFL Baltimore Ravens Design"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 196
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 518,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/sceptre-e195bd-srr-19-inch-720p-led-tv-true-black-2017.html",
+ "geolocation": null,
+ "intent_template": "Add this product to my wishlist",
+ "instantiation_dict": {},
+ "intent": "Add this product to my wishlist",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/wishlist/",
+ "locator": "document.querySelector('.products-grid.wishlist').outerText",
+ "required_contents": {
+ "must_include": [
+ "Sceptre E195BD-SRR 19-Inch 720P LED TV, True Black (2017)"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 196
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 519,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/iphone-13-pro-max-case-neon-turtle-iphone-13-pro-max-cases-tempered-glass-back-soft-silicone-tpu-shock-protective-case-for-apple-iphone-13-pro-max.html",
+ "geolocation": null,
+ "intent_template": "Add this product to my wishlist",
+ "instantiation_dict": {},
+ "intent": "Add this product to my wishlist",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/wishlist/",
+ "locator": "document.querySelector('.products-grid.wishlist').outerText",
+ "required_contents": {
+ "must_include": [
+ "iPhone 13 Pro Max Case, Neon Turtle iPhone 13 Pro Max Cases, Tempered Glass Back+Soft Silicone TPU Shock Protective Case for Apple iPhone 13 Pro Max"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 196
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 520,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__/magnetic-metal-stainless-steel-d-pads-kits-directional-pad-replacement-parts-for-xbox-one-elite-controller-elite-series-2-xbox-one-xbox-one-s-x-controller.html",
+ "geolocation": null,
+ "intent_template": "Add this product to my wishlist",
+ "instantiation_dict": {},
+ "intent": "Add this product to my wishlist",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/wishlist/",
+ "locator": "document.querySelector('.products-grid.wishlist').outerText",
+ "required_contents": {
+ "must_include": [
+ "Magnetic Metal Stainless Steel D-pads Kits Directional Pad Replacement Parts for Xbox One Elite Controller, Elite Series 2, Xbox One, Xbox One S/X Controller"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 196
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 521,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Subscribe to the newsletter of OneStopMarket",
+ "instantiation_dict": {},
+ "intent": "Subscribe to the newsletter of OneStopMarket",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/newsletter/manage/",
+ "locator": "document.querySelector('[title=\"General Subscription\"').checked.toString()",
+ "required_contents": {
+ "exact_match": "true"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 199
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 522,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Fork {{repo}}.",
+ "instantiation_dict": {
+ "repo": "all repos from facebook"
+ },
+ "intent": "Fork all repos from facebook.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/dashboard/projects",
+ "locator": "document.querySelector('[data-qa-selector=\"projects_list\"').outerText",
+ "required_contents": {
+ "must_include": [
+ "create-react-app",
+ "buck"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 352
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 523,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Star the top {{number}} most stared repos in Gitlab",
+ "instantiation_dict": {
+ "number": "five"
+ },
+ "intent": "Star the top five most stared repos in Gitlab",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/users/byteblaze/starred",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "AndroidSlidingUpPanel",
+ "create-react-app",
+ "ffmpeg-python",
+ "PHP_XLSXWriter",
+ "AndroidAsync"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 354
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 524,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Star the top {{number}} most stared repos in Gitlab",
+ "instantiation_dict": {
+ "number": "eight"
+ },
+ "intent": "Star the top eight most stared repos in Gitlab",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/users/byteblaze/starred",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "AndroidSlidingUpPanel",
+ "create-react-app",
+ "ffmpeg-python",
+ "PHP_XLSXWriter",
+ "AndroidAsync",
+ "Pytorch-GAN",
+ "administrate",
+ "keycloak"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 354
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 525,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Star the top {{number}} most stared repos in Gitlab",
+ "instantiation_dict": {
+ "number": "four"
+ },
+ "intent": "Star the top four most stared repos in Gitlab",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/users/byteblaze/starred",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "AndroidSlidingUpPanel",
+ "create-react-app",
+ "ffmpeg-python",
+ "PHP_XLSXWriter"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 354
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 526,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Star the top {{number}} most stared repos in Gitlab",
+ "instantiation_dict": {
+ "number": "three"
+ },
+ "intent": "Star the top three most stared repos in Gitlab",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/users/byteblaze/starred",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "AndroidSlidingUpPanel",
+ "create-react-app",
+ "ffmpeg-python"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 354
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 527,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Star the top {{number}} most stared repos in Gitlab",
+ "instantiation_dict": {
+ "number": "one"
+ },
+ "intent": "Star the top one most stared repos in Gitlab",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/users/byteblaze/starred",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "AndroidSlidingUpPanel"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 354
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 528,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet",
+ "instantiation_dict": {
+ "product": "phone screen protector",
+ "time": "March 2023"
+ },
+ "intent": "Draft a refund message via their \"contact us\" form for the phone screen protector I bought March 2023. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $12.99?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 154
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 529,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet",
+ "instantiation_dict": {
+ "product": "bluetooth speaker",
+ "time": "Feb 2023"
+ },
+ "intent": "Draft a refund message via their \"contact us\" form for the bluetooth speaker I bought Feb 2023. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000148?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $169.95?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 154
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 530,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet",
+ "instantiation_dict": {
+ "product": "kitchen organizer",
+ "time": "around Feb 2023"
+ },
+ "intent": "Draft a refund message via their \"contact us\" form for the kitchen organizer I bought around Feb 2023. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000161?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $68.88?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 154
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 531,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet",
+ "instantiation_dict": {
+ "product": "phone case",
+ "time": "March 2023"
+ },
+ "intent": "Draft a refund message via their \"contact us\" form for the phone case I bought March 2023. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $12.99?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 154
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 532,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Draft a refund message via their \"contact us\" form for the {{product}} I bought {{time}}. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet",
+ "instantiation_dict": {
+ "product": "PS3 remote controller",
+ "time": "early 2023"
+ },
+ "intent": "Draft a refund message via their \"contact us\" form for the PS3 remote controller I bought early 2023. It broke after three days of use. The shop requires the order id, the reason and the amount to refund in the message. Don't submit yet",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $1.63?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 154
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 533,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Follow {{account_list}} on Gitlab",
+ "instantiation_dict": {
+ "account_list": [
+ "convexegg",
+ "yjlou"
+ ]
+ },
+ "intent": "Follow ['convexegg', 'yjlou'] on Gitlab",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/users/byteblaze/following",
+ "locator": "document.querySelector('.user-profile').outerText",
+ "required_contents": {
+ "must_include": [
+ "@convexegg",
+ "@yjlou"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 330
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 534,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Follow {{account_list}} on Gitlab",
+ "instantiation_dict": {
+ "account_list": [
+ "Jakub Klinkovsk\u00fd",
+ "Koushik",
+ "Vinta Chen"
+ ]
+ },
+ "intent": "Follow ['Jakub Klinkovsk\u00fd', 'Koushik', 'Vinta Chen'] on Gitlab",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/users/byteblaze/following",
+ "locator": "document.querySelector('.user-profile').outerText",
+ "required_contents": {
+ "must_include": [
+ "@lahwaacz",
+ "@koush",
+ "@vinta"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 330
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 535,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Follow {{account_list}} on Gitlab",
+ "instantiation_dict": {
+ "account_list": [
+ "Jakub K",
+ "ghost",
+ "Beno\u00eet Blanchon"
+ ]
+ },
+ "intent": "Follow ['Jakub K', 'ghost', 'Beno\u00eet Blanchon'] on Gitlab",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/users/byteblaze/following",
+ "locator": "document.querySelector('.user-profile').outerText",
+ "required_contents": {
+ "must_include": [
+ "@lahwaacz",
+ "@ghost",
+ "@bblanchon"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 330
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 536,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Follow {{account_list}} on Gitlab",
+ "instantiation_dict": {
+ "account_list": [
+ "ghost",
+ "R1kk3r",
+ "Abishek"
+ ]
+ },
+ "intent": "Follow ['ghost', 'R1kk3r', 'Abishek'] on Gitlab",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/users/byteblaze/following",
+ "locator": "document.querySelector('.user-profile').outerText",
+ "required_contents": {
+ "must_include": [
+ "@ghost",
+ "@R1kk3r",
+ "@abisubramanya27"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 330
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 537,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Follow {{account_list}} on Gitlab",
+ "instantiation_dict": {
+ "account_list": [
+ "Jakub Klinkovsk",
+ "convexegg",
+ "Vinta Chen",
+ "yjlou",
+ "Abishek S"
+ ]
+ },
+ "intent": "Follow ['Jakub Klinkovsk', 'convexegg', 'Vinta Chen', 'yjlou', 'Abishek S'] on Gitlab",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/users/byteblaze/following",
+ "locator": "document.querySelector('.user-profile').outerText",
+ "required_contents": {
+ "must_include": [
+ "@lahwaacz",
+ "@convexegg",
+ "@vinta",
+ "@yjlou",
+ "@abisubramanya27"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 330
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 538,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Modify the address of order #{{order_id}} to {{address}}",
+ "instantiation_dict": {
+ "order_id": "299",
+ "address": "456 Oak Avenue, Apartment 5B, New York, NY, 10001"
+ },
+ "intent": "Modify the address of order #299 to 456 Oak Avenue, Apartment 5B, New York, NY, 10001",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/299",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "456 Oak Avenue",
+ "Apartment 5B",
+ "New York",
+ "10001"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 240
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 539,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Modify the address of order #{{order_id}} to {{address}}",
+ "instantiation_dict": {
+ "order_id": "65",
+ "address": "789 Pine Lane, San Francisco, CA, 94102"
+ },
+ "intent": "Modify the address of order #65 to 789 Pine Lane, San Francisco, CA, 94102",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/65",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "789 Pine Lane",
+ "San Francisco",
+ "California",
+ "94102"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 240
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 540,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Modify the address of order #{{order_id}} to {{address}}",
+ "instantiation_dict": {
+ "order_id": "301",
+ "address": "321 Birch Boulevard, Suite 200, Dallas, TX, 75201"
+ },
+ "intent": "Modify the address of order #301 to 321 Birch Boulevard, Suite 200, Dallas, TX, 75201",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/301",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "321 Birch Boulevard",
+ "Suite 200",
+ "Dallas",
+ "Texas",
+ "75201"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 240
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 541,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Modify the address of order #{{order_id}} to {{address}}",
+ "instantiation_dict": {
+ "order_id": "125",
+ "address": "654 Elm Drive, Apartment 12, Miami, FL, 33101"
+ },
+ "intent": "Modify the address of order #125 to 654 Elm Drive, Apartment 12, Miami, FL, 33101",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/125",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "654 Elm Drive",
+ "Apartment 12",
+ "Miami",
+ "Florida",
+ "33101"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 240
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 542,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Modify the address of order #{{order_id}} to {{address}}",
+ "instantiation_dict": {
+ "order_id": "300",
+ "address": "987 Cedar Court, Los Angeles, CA, 90012"
+ },
+ "intent": "Modify the address of order #300 to 987 Cedar Court, Los Angeles, CA, 90012",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/sales/order/view/order_id/300",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "987 Cedar Court",
+ "Los Angeles",
+ "California",
+ "90012"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 240
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 543,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Update the product description of {{product}} to highlight the real user positive reviews by quoting the comments",
+ "instantiation_dict": {
+ "product": "Bella Tank"
+ },
+ "intent": "Update the product description of Bella Tank to highlight the real user positive reviews by quoting the comments",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/../bella-tank.html",
+ "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')",
+ "required_contents": {
+ "must_include": [
+ "Good choice for working out and stylin' enough to wear when I'm hanging with friends on hot days",
+ "Also washes really well",
+ "Always a sweet n sporty look for the gym",
+ "Keeps me cool and the seams don't rub up against me like some of my other tanks"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 251
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 544,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Update the description of {{product}} to highlight the real user positive reviews by quoting the comments",
+ "instantiation_dict": {
+ "product": "Selene Yoga Hoodie"
+ },
+ "intent": "Update the description of Selene Yoga Hoodie to highlight the real user positive reviews by quoting the comments",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/../selene-yoga-hoodie.html",
+ "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')",
+ "required_contents": {
+ "must_include": [
+ "I was super cold and it did the job.",
+ "The sleeves are definitely thicker than you realize, which is a good thing",
+ "really quite substantial",
+ "planning on buying another one of these in another color",
+ "the best hoodie ive ever owned"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 251
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 545,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Update the description of {{product}} to highlight the real user positive reviews by quoting the comments",
+ "instantiation_dict": {
+ "product": "Radiant Tee"
+ },
+ "intent": "Update the description of Radiant Tee to highlight the real user positive reviews by quoting the comments",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/../radiant-tee.html",
+ "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')",
+ "required_contents": {
+ "must_include": [
+ "What I rally love here is that it does the job of keeping me cool and dry",
+ "I'm a big guy and sweat A LOT",
+ "Even after a day of gulf, I'm still dry and comfortable",
+ "What a versatile shirt",
+ "Not only does it feel very soft compared to my old worn out polos, but it also does the job promised",
+ "I like going out after my game for drinks so I look good then too and don't need to change into something fresh"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 251
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 546,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Update the description of {{product}} to highlight the real user positive reviews by quoting the comments",
+ "instantiation_dict": {
+ "product": "Lucia Cross-Fit Bra"
+ },
+ "intent": "Update the description of Lucia Cross-Fit Bra to highlight the real user positive reviews by quoting the comments",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/../affirm-water-bottle.html",
+ "locator": "document.querySelector('.data.item.content').outerText + (document.querySelector('.product.attribute.overview [itemprop=\"description\"]')?.outerText || '')",
+ "required_contents": {
+ "must_include": [
+ "Wide mouth opening makes it easy to clean"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 251
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 547,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Add a new {{option}} option {{value}} to the {{base_setting}} of {{product}}",
+ "instantiation_dict": {
+ "option": "color",
+ "value": "brown",
+ "base_setting": "size S",
+ "product": "Phoebe Zipper Sweatshirt"
+ },
+ "intent": "Add a new color option brown to the size S of Phoebe Zipper Sweatshirt",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1130/",
+ "locator": "document.querySelector('[data-index=\"configurable\"').outerText",
+ "required_contents": {
+ "must_include": [
+ "Phoebe Zipper Sweatshirt-S-Brown"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 252
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 548,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Add a new {{option}} {{value}} to {{base_setting}} of {{product}}",
+ "instantiation_dict": {
+ "option": "color",
+ "value": "blue",
+ "base_setting": "size S and M",
+ "product": "Frankie Sweatshirt"
+ },
+ "intent": "Add a new color blue to size S and M of Frankie Sweatshirt",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/110/",
+ "locator": "document.querySelector('[data-index=\"configurable\"').outerText",
+ "required_contents": {
+ "must_include": [
+ "Sweatshirt-M-Blue",
+ "Sweatshirt-S-Blue"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 252
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 549,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Add a new {{option}} {{value}} to {{base_setting}} {{product}}",
+ "instantiation_dict": {
+ "option": "size",
+ "value": "XXXL",
+ "base_setting": "green",
+ "product": "Minerva LumaTech V-Tee"
+ },
+ "intent": "Add a new size XXXL to green Minerva LumaTech V-Tee",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1492/",
+ "locator": "document.querySelector('[data-index=\"configurable\"').outerText",
+ "required_contents": {
+ "must_include": [
+ "V-Tee-XXXL-Green"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 252
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 550,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Add a new {{option}} {{value}} to {{base_setting}} {{product}}",
+ "instantiation_dict": {
+ "option": "size",
+ "value": "XXS",
+ "base_setting": "blue and purple",
+ "product": "Nona Fitness Tank"
+ },
+ "intent": "Add a new size XXS to blue and purple Nona Fitness Tank",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1732/",
+ "locator": "document.querySelector('[data-index=\"configurable\"').outerText",
+ "required_contents": {
+ "must_include": [
+ "Tank-XXS-Blue",
+ "Tank-XXS-Purple"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 252
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 551,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Add new {{option}} {{value}} to {{base_setting}} of {{product}}",
+ "instantiation_dict": {
+ "option": "size",
+ "value": "30 and 31",
+ "base_setting": "all color variants",
+ "product": "Diana Tights"
+ },
+ "intent": "Add new size 30 and 31 to all color variants of Diana Tights",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1854/",
+ "locator": "document.querySelector('[data-index=\"configurable\"').outerText",
+ "required_contents": {
+ "must_include": [
+ "Tights-30-Blue",
+ "Tights-30-Black",
+ "Tights-30-Orange",
+ "Tights-31-Blue",
+ "Tights-31-Black",
+ "Tights-31-Orange"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 252
+ },
+ {
+ "sites": [
+ "gitlab",
+ "reddit"
+ ],
+ "task_id": 552,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab.reddit_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}}?",
+ "instantiation_dict": {
+ "directory": "real_space",
+ "subreddit": "space",
+ "gitlab_repo": "gimmiethat.space"
+ },
+ "intent": "Create a folder named real_space in gimmiethat.space repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the space?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/real_space/urls.txt",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "__REDDIT__/f/space/134164/scientists-erupt-at-nasa-gutting-funding-for-crucial-venus",
+ "__REDDIT__/f/space/134163/virgin-orbit-fails-to-secure-funding-will-cease-operations",
+ "__REDDIT__/f/space/134162/nasa-to-name-artemis-2-crew-next-week-the-first-moon",
+ "__REDDIT__/f/space/134161/bent-light-in-deep-space-reveals-one-of-the-biggest-black",
+ "__REDDIT__/f/space/134160/seti-s-new-machine-learning-algorithm-works-like-google-s"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 84
+ },
+ {
+ "sites": [
+ "gitlab",
+ "reddit"
+ ],
+ "task_id": 553,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab.reddit_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}}?",
+ "instantiation_dict": {
+ "directory": "news",
+ "gitlab_repo": "gimmiethat.space",
+ "subreddit": "news related subreddits"
+ },
+ "intent": "Create a folder named news in gimmiethat.space repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the news related subreddits?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/news/urls.txt",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "__REDDIT__/f/news/129905/ohio-man-charged-for-using-molotov-cocktails-to-attack",
+ "__REDDIT__/f/news/129904/in-a-loss-for-fox-news-judge-allows-dominion-s-defamation",
+ "__REDDIT__/f/news/129903/theater-group-sues-to-block-tennessee-s-new-anti-drag-law",
+ "__REDDIT__/f/news/129902/andrew-tate-released-from-jail-in-romania-and-placed-under",
+ "__REDDIT__/f/news/129901/rare-high-risk-storm-alert-issued-for-parts-of-midwest-and"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 84
+ },
+ {
+ "sites": [
+ "gitlab",
+ "reddit"
+ ],
+ "task_id": 554,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab.reddit_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}}?",
+ "instantiation_dict": {
+ "directory": "moive_space",
+ "gitlab_repo": "gimmiethat.space",
+ "subreddit": "movies"
+ },
+ "intent": "Create a folder named moive_space in gimmiethat.space repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the movies?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/moive_space/urls.txt",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "__REDDIT__/f/movies/128825/scenes-in-film-that-feel-off-or-wrong-in-some-way-and-make",
+ "__REDDIT__/f/movies/128824/disney-s-live-action-lilo-amp-stitch-movie-finds-its-lilo-in",
+ "__REDDIT__/f/movies/128823/fantastic-four-movie-gets-new-writer-with-avatar-the-way-of",
+ "__REDDIT__/f/movies/128822/can-someone-explain-what-made-steven-seagal-so-appealing-for",
+ "__REDDIT__/f/movies/128821/ban-on-fetish-sex-depictions-in-film-should-end-australia"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 84
+ },
+ {
+ "sites": [
+ "gitlab",
+ "reddit"
+ ],
+ "task_id": 555,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab.reddit_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a folder named {{directory}} in {{gitlab_repo}} repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the {{subreddit}}?",
+ "instantiation_dict": {
+ "directory": "funny_pic",
+ "gitlab_repo": "gimmiethat.space",
+ "subreddit": "memes"
+ },
+ "intent": "Create a folder named funny_pic in gimmiethat.space repo. Within it, create a file named urls.txt that contains the URLs of the 5 most recent posts from the memes?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/gimmiethat.space/-/raw/main/funny_pic/urls.txt",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "__REDDIT__/f/memes/127991/it-do-be-like-that-tho",
+ "__REDDIT__/f/memes/127990/thank-you-memers-this-wouldn-t-be-possible-without-you",
+ "__REDDIT__/f/memes/127989/if-you-have-no-other-choice",
+ "__REDDIT__/f/memes/127988/yes-yes-yes",
+ "__REDDIT__/f/memes/127987/shagadelic-baby"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 84
+ },
+ {
+ "sites": [
+ "gitlab",
+ "wikipedia"
+ ],
+ "task_id": 556,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a repo named {{name}} with {{topics}} in a README file",
+ "instantiation_dict": {
+ "name": "nolan_honest_fans",
+ "topics": "movies directed by Christopher Nolan"
+ },
+ "intent": "Create a repo named nolan_honest_fans with movies directed by Christopher Nolan in a README file",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/nolan_honest_fans/-/raw/main/README.md",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Following",
+ "Memento",
+ "Insomnia",
+ "Batman Begins",
+ "The Prestige",
+ "The Dark Knight",
+ "Inception",
+ "The Dark Knight Rises",
+ "Interstellar",
+ "Dunkirk",
+ "Tenet",
+ "Oppenheimer"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 87
+ },
+ {
+ "sites": [
+ "gitlab",
+ "wikipedia"
+ ],
+ "task_id": 557,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a repo named {{name}} with {{topics}} in a README file",
+ "instantiation_dict": {
+ "name": "nolan_old_fans",
+ "topics": "movies directed by Christopher Nolan before 2010"
+ },
+ "intent": "Create a repo named nolan_old_fans with movies directed by Christopher Nolan before 2010 in a README file",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/nolan_old_fans/-/raw/main/README.md",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Following",
+ "Memento",
+ "Insomnia",
+ "Batman Begins",
+ "The Prestige",
+ "The Dark Knight"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 87
+ },
+ {
+ "sites": [
+ "gitlab",
+ "wikipedia"
+ ],
+ "task_id": 558,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a repo named {{name}} with {{topics}} in a README file",
+ "instantiation_dict": {
+ "name": "nolan_young_fans",
+ "topics": "movies directed by Christopher Nolan after 2010"
+ },
+ "intent": "Create a repo named nolan_young_fans with movies directed by Christopher Nolan after 2010 in a README file",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/nolan_young_fans/-/raw/main/README.md",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Inception",
+ "The Dark Knight Rises",
+ "Interstellar",
+ "Dunkirk",
+ "Tenet",
+ "Oppenheimer"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 87
+ },
+ {
+ "sites": [
+ "gitlab",
+ "wikipedia"
+ ],
+ "task_id": 559,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a repo named {{name}} with {{topics}} in a README file",
+ "instantiation_dict": {
+ "name": "nolan_followers",
+ "topics": "career timeline of Christopher Nolan"
+ },
+ "intent": "Create a repo named nolan_followers with career timeline of Christopher Nolan in a README file",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/nolan_followers/-/raw/main/README.md",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "1993\u20132003: Early career and breakthrough",
+ "2003\u20132013: Widespread recognition",
+ "2014\u20132019: Established Hollywood auteur",
+ "2020\u2013present"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 87
+ },
+ {
+ "sites": [
+ "gitlab",
+ "wikipedia"
+ ],
+ "task_id": 560,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a repo named {{name}} with {{topics}} in a README file",
+ "instantiation_dict": {
+ "name": "nolan_academy_awards",
+ "topics": "movies that won Academy Awards by Christopher Nolan"
+ },
+ "intent": "Create a repo named nolan_academy_awards with movies that won Academy Awards by Christopher Nolan in a README file",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/nolan_academy_awards/-/raw/main/README.md",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "The Dark Knight",
+ "Inception",
+ "Interstellar",
+ "Dunkirk",
+ "Tenet"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 87
+ },
+ {
+ "sites": [
+ "gitlab",
+ "wikipedia"
+ ],
+ "task_id": 561,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a repo named {{name}} with {{topics}} in a README file",
+ "instantiation_dict": {
+ "name": "bafta_awards_nolan",
+ "topics": "movies that are nominated BAFTA Awards by Christopher Nolan"
+ },
+ "intent": "Create a repo named bafta_awards_nolan with movies that are nominated BAFTA Awards by Christopher Nolan in a README file",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/bafta_awards_nolan/-/raw/main/README.md",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Batman Begins",
+ "The Dark Knight",
+ "Inception",
+ "The Dark Knight Rises",
+ "Interstellar",
+ "Dunkirk",
+ "Tenet"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 87
+ },
+ {
+ "sites": [
+ "gitlab",
+ "reddit"
+ ],
+ "task_id": 562,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab.reddit_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit?",
+ "instantiation_dict": {
+ "name": "Awesome_DIY_ideas",
+ "num": 6
+ },
+ "intent": "create a repository named Awesome_DIY_ideas that includes a README file with the links to the most active 6 DIY ideas on DIY subreddit?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/Awesome_DIY_ideas/-/raw/main/README.md",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts",
+ "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess",
+ "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing",
+ "__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit",
+ "__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches",
+ "__REDDIT__/f/DIY/118931/afci-outlet-question"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 88
+ },
+ {
+ "sites": [
+ "gitlab",
+ "reddit"
+ ],
+ "task_id": 563,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab.reddit_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit?",
+ "instantiation_dict": {
+ "name": "fun_thing_to_do",
+ "num": 5
+ },
+ "intent": "create a repository named fun_thing_to_do that includes a README file with the links to the most active 5 DIY ideas on DIY subreddit?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/fun_thing_to_do/-/raw/main/README.md",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts",
+ "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess",
+ "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing",
+ "__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit",
+ "__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 88
+ },
+ {
+ "sites": [
+ "gitlab",
+ "reddit"
+ ],
+ "task_id": 564,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab.reddit_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit?",
+ "instantiation_dict": {
+ "name": "live_a_life",
+ "num": 3
+ },
+ "intent": "create a repository named live_a_life that includes a README file with the links to the most active 3 DIY ideas on DIY subreddit?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/live_a_life/-/raw/main/README.md",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts",
+ "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess",
+ "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 88
+ },
+ {
+ "sites": [
+ "gitlab",
+ "reddit"
+ ],
+ "task_id": 565,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab.reddit_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit?",
+ "instantiation_dict": {
+ "name": "TODO",
+ "num": 10
+ },
+ "intent": "create a repository named TODO that includes a README file with the links to the most active 10 DIY ideas on DIY subreddit?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/TODO/-/raw/main/README.md",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts",
+ "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess",
+ "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing",
+ "__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit",
+ "__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches",
+ "__REDDIT__/f/DIY/118931/afci-outlet-question",
+ "__REDDIT__/f/DIY/118824/teflon-tape-to-attach-washing-machine-drain-hose-to-pipe",
+ "__REDDIT__/f/DIY/118866/paver-base-for-shed",
+ "__REDDIT__/f/DIY/118820/ways-to-locate-our-buried-electrical-service",
+ "__REDDIT__/f/DIY/118836/how-to-eliminate-transitions-for-disability-mobility-reasons"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 88
+ },
+ {
+ "sites": [
+ "gitlab",
+ "reddit"
+ ],
+ "task_id": 566,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab.reddit_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "create a repository named {{name}} that includes a README file with the links to the most active {{num}} DIY ideas on DIY subreddit?",
+ "instantiation_dict": {
+ "name": "Do it myself",
+ "num": 8
+ },
+ "intent": "create a repository named Do it myself that includes a README file with the links to the most active 8 DIY ideas on DIY subreddit?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/Do-it-myself/-/raw/main/README.md",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "__REDDIT__/f/DIY/118903/separate-glued-plastic-parts",
+ "__REDDIT__/f/DIY/118923/how-would-you-fix-this-dryer-vent-mess",
+ "__REDDIT__/f/DIY/118935/basement-bulkhead-soffit-wall-framing",
+ "__REDDIT__/f/DIY/118904/ge-water-heater-pilot-light-won-t-stay-lit",
+ "__REDDIT__/f/DIY/118960/attempting-to-move-a-wall-outlet-in-my-basement-a-few-inches",
+ "__REDDIT__/f/DIY/118931/afci-outlet-question",
+ "__REDDIT__/f/DIY/118824/teflon-tape-to-attach-washing-machine-drain-hose-to-pipe",
+ "__REDDIT__/f/DIY/118866/paver-base-for-shed"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 88
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 567,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}} repo",
+ "instantiation_dict": {
+ "collaborator_account_list": "Jakub Klinkovsk\u00fd and Beno\u00eet Blanchon",
+ "repo": "gimmiethat.space"
+ },
+ "intent": "Invite Jakub Klinkovsk\u00fd and Beno\u00eet Blanchon as collaborator to gimmiethat.space repo",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/gimmiethat.space/-/project_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "@lahwaacz",
+ "@bblanchon"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 293
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 568,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}} repo",
+ "instantiation_dict": {
+ "collaborator_account_list": "Abishek and Vinta",
+ "repo": "a11yproject.com"
+ },
+ "intent": "Invite Abishek and Vinta as collaborator to a11yproject.com repo",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/a11yproject/a11yproject.com/-/project_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "@abisubramanya27",
+ "@vinta"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 293
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 569,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}} repo",
+ "instantiation_dict": {
+ "collaborator_account_list": "Beno\u00eet and Abishek",
+ "repo": "my HTML5 markup extention"
+ },
+ "intent": "Invite Beno\u00eet and Abishek as collaborator to my HTML5 markup extention repo",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/accessible-html-content-patterns/-/project_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "@bblanchon",
+ "@abisubramanya27"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 293
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 570,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Invite {{collaborator_account_list}} as collaborator to {{repo}} repo",
+ "instantiation_dict": {
+ "collaborator_account_list": "Jakub K, Alex Dills, Alex Hutnik and Beno\u00eet Blanchon",
+ "repo": "my time tracking tool project"
+ },
+ "intent": "Invite Jakub K, Alex Dills, Alex Hutnik and Beno\u00eet Blanchon as collaborator to my time tracking tool project repo",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/timeit/-/project_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "@lahwaacz",
+ "@V13Axel",
+ "@alexhutnik",
+ "@bblanchon"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 293
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 571,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly",
+ "instantiation_dict": {
+ "address": "231 Willow Way, Suite 100, Chicago, IL, 60601"
+ },
+ "intent": "I recently moved, my address is 231 Willow Way, Suite 100, Chicago, IL, 60601, update my information on OneStopShopping accordingly",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/customer/address",
+ "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText",
+ "required_contents": {
+ "must_include": [
+ "231 Willow Way",
+ "Suite 100",
+ "Chicago, Illinois, 60601"
+ ]
+ }
+ },
+ {
+ "url": "__SHOPPING__/customer/address",
+ "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText",
+ "required_contents": {
+ "must_include": [
+ "231 Willow Way",
+ "Suite 100",
+ "Chicago, Illinois, 60601"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 165
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 572,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly",
+ "instantiation_dict": {
+ "address": "654 Aspen Road, House #3, Boston, MA, 02110"
+ },
+ "intent": "I recently moved, my address is 654 Aspen Road, House #3, Boston, MA, 02110, update my information on OneStopShopping accordingly",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/customer/address",
+ "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText",
+ "required_contents": {
+ "must_include": [
+ "654 Aspen Road",
+ "House #3",
+ "Boston, Massachusetts, 02110"
+ ]
+ }
+ },
+ {
+ "url": "__SHOPPING__/customer/address",
+ "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText",
+ "required_contents": {
+ "must_include": [
+ "654 Aspen Road",
+ "House #3",
+ "Boston, Massachusetts, 02110"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 165
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 573,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly",
+ "instantiation_dict": {
+ "address": "987 Sycamore Circle, Philadelphia, PA, 19102"
+ },
+ "intent": "I recently moved, my address is 987 Sycamore Circle, Philadelphia, PA, 19102, update my information on OneStopShopping accordingly",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/customer/address",
+ "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText",
+ "required_contents": {
+ "must_include": [
+ "987 Sycamore Circle",
+ "Philadelphia, Pennsylvania, 19102"
+ ]
+ }
+ },
+ {
+ "url": "__SHOPPING__/customer/address",
+ "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText",
+ "required_contents": {
+ "must_include": [
+ "987 Sycamore Circle",
+ "Philadelphia, Pennsylvania, 19102"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 165
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 574,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly",
+ "instantiation_dict": {
+ "address": "111 Magnolia Path, Atlanta, GA, 30303"
+ },
+ "intent": "I recently moved, my address is 111 Magnolia Path, Atlanta, GA, 30303, update my information on OneStopShopping accordingly",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/customer/address",
+ "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText",
+ "required_contents": {
+ "must_include": [
+ "111 Magnolia Path",
+ "Atlanta, Georgia, 30303"
+ ]
+ }
+ },
+ {
+ "url": "__SHOPPING__/customer/address",
+ "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText",
+ "required_contents": {
+ "must_include": [
+ "111 Magnolia Path",
+ "Atlanta, Georgia, 30303"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 165
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 575,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "I recently moved, my address is {{address}}, update my information on OneStopShopping accordingly",
+ "instantiation_dict": {
+ "address": "222 Redwood Rise, Suite 300, Seattle, WA, 98101"
+ },
+ "intent": "I recently moved, my address is 222 Redwood Rise, Suite 300, Seattle, WA, 98101, update my information on OneStopShopping accordingly",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING__/customer/address",
+ "locator": "document.querySelector(\".box.box-address-shipping > .box-content\").outerText",
+ "required_contents": {
+ "must_include": [
+ "222 Redwood Rise",
+ "Suite 300",
+ "Seattle, Washington, 98101"
+ ]
+ }
+ },
+ {
+ "url": "__SHOPPING__/customer/address",
+ "locator": "document.querySelector(\".box.box-address-billing > .box-content\").outerText",
+ "required_contents": {
+ "must_include": [
+ "222 Redwood Rise",
+ "Suite 300",
+ "Seattle, Washington, 98101"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 165
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 576,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Add the following users to repo {{repo}} as {{role}}: {{user_list}}",
+ "instantiation_dict": {
+ "repo": "a11y-webring.club",
+ "role": "developer",
+ "user_list": [
+ "abisubramanya27",
+ "lahwaacz"
+ ]
+ },
+ "intent": "Add the following users to repo a11y-webring.club as developer: ['abisubramanya27', 'lahwaacz']",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/a11y-webring.club/-/project_members",
+ "locator": "func:gitlab_get_project_memeber_role(__page__, 'abisubramanya27')",
+ "required_contents": {
+ "must_include": [
+ "Developer"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/a11y-webring.club/-/project_members",
+ "locator": "func:gitlab_get_project_memeber_role(__page__, 'lahwaacz')",
+ "required_contents": {
+ "must_include": [
+ "Developer"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 351
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 577,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Add the following users to my {{repo}} as {{role}}: {{user_list}}",
+ "instantiation_dict": {
+ "repo": "GitHub timeline item management extension",
+ "role": "maintainer",
+ "user_list": [
+ "abisubramanya27",
+ "lahwaacz"
+ ]
+ },
+ "intent": "Add the following users to my GitHub timeline item management extension as maintainer: ['abisubramanya27', 'lahwaacz']",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/remove-board-movement-events-from-the-github-issue-timeline/-/project_members",
+ "locator": "func:gitlab_get_project_memeber_role(__page__, 'abisubramanya27')",
+ "required_contents": {
+ "must_include": [
+ "Maintainer"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/remove-board-movement-events-from-the-github-issue-timeline/-/project_members",
+ "locator": "func:gitlab_get_project_memeber_role(__page__, 'lahwaacz')",
+ "required_contents": {
+ "must_include": [
+ "Maintainer"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 351
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 578,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Add the following users to repo {{repo}} as {{role}}: {{user_list}}",
+ "instantiation_dict": {
+ "repo": "millennials-to-snake-people",
+ "role": "reporter",
+ "user_list": [
+ "yjlou",
+ "a11yproject"
+ ]
+ },
+ "intent": "Add the following users to repo millennials-to-snake-people as reporter: ['yjlou', 'a11yproject']",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/millennials-to-snake-people/-/project_members",
+ "locator": "func:gitlab_get_project_memeber_role(__page__, 'yjlou')",
+ "required_contents": {
+ "must_include": [
+ "Reporter"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/millennials-to-snake-people/-/project_members",
+ "locator": "func:gitlab_get_project_memeber_role(__page__, 'a11yproject')",
+ "required_contents": {
+ "must_include": [
+ "Reporter"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 351
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 579,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Add the following users to my {{repo}} as {{role}}: {{user_list}}",
+ "instantiation_dict": {
+ "repo": "time tracking tool",
+ "role": "guest",
+ "user_list": [
+ "yjlou"
+ ]
+ },
+ "intent": "Add the following users to my time tracking tool as guest: ['yjlou']",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/timeit/-/project_members",
+ "locator": "func:gitlab_get_project_memeber_role(__page__, 'yjlou')",
+ "required_contents": {
+ "must_include": [
+ "Guest"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 351
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 580,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?",
+ "instantiation_dict": {
+ "name": "sci_fi",
+ "description": "A wild place for sci-fi enthusiasts",
+ "sidebar_list": [
+ "New",
+ "Classic",
+ "Movies",
+ "Post my novel",
+ "Random"
+ ]
+ },
+ "intent": "Create a new forum named sci_fi, with a description of A wild place for sci-fi enthusiasts, and include ['New', 'Classic', 'Movies', 'Post my novel', 'Random'] in the sidebar?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/sci_fi/edit",
+ "locator": "document.querySelector(\"#forum_description\").value",
+ "required_contents": {
+ "must_include": [
+ "A wild place for sci-fi enthusiasts"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/sci_fi/edit",
+ "locator": "document.querySelector(\"#forum_sidebar\").value",
+ "required_contents": {
+ "must_include": [
+ "New",
+ "Classic",
+ "Movies",
+ "Post my novel",
+ "Random"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 7
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 581,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?",
+ "instantiation_dict": {
+ "name": "cmu_lti",
+ "description": "Language Technologies Institute at Carnegie Mellon University",
+ "sidebar_list": [
+ "announcement",
+ "paper",
+ "alumni"
+ ]
+ },
+ "intent": "Create a new forum named cmu_lti, with a description of Language Technologies Institute at Carnegie Mellon University, and include ['announcement', 'paper', 'alumni'] in the sidebar?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/cmu_lti/edit",
+ "locator": "document.querySelector(\"#forum_description\").value",
+ "required_contents": {
+ "must_include": [
+ "Language Technologies Institute at Carnegie Mellon University"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/cmu_lti/edit",
+ "locator": "document.querySelector(\"#forum_sidebar\").value",
+ "required_contents": {
+ "must_include": [
+ "announcement",
+ "paper",
+ "alumni"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 7
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 582,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?",
+ "instantiation_dict": {
+ "name": "Cyberpunk",
+ "description": "Welcome to the future",
+ "sidebar_list": [
+ "Games",
+ "Books",
+ "Movies",
+ "Future"
+ ]
+ },
+ "intent": "Create a new forum named Cyberpunk, with a description of Welcome to the future, and include ['Games', 'Books', 'Movies', 'Future'] in the sidebar?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/Cyberpunk/edit",
+ "locator": "document.querySelector(\"#forum_description\").value",
+ "required_contents": {
+ "must_include": [
+ "Welcome to the future"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/Cyberpunk/edit",
+ "locator": "document.querySelector(\"#forum_sidebar\").value",
+ "required_contents": {
+ "must_include": [
+ "Games",
+ "Books",
+ "Movies",
+ "Future"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 7
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 583,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?",
+ "instantiation_dict": {
+ "name": "PlantsForCatParents",
+ "description": "Cat parents & plan lovers",
+ "sidebar_list": [
+ "Cat friendly",
+ "Local vendors",
+ "Promotion",
+ "Toxic plants!"
+ ]
+ },
+ "intent": "Create a new forum named PlantsForCatParents, with a description of Cat parents & plan lovers, and include ['Cat friendly', 'Local vendors', 'Promotion', 'Toxic plants!'] in the sidebar?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/PlantsForCatParents/edit",
+ "locator": "document.querySelector(\"#forum_description\").value",
+ "required_contents": {
+ "must_include": [
+ "Cat parents & plan lovers"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/PlantsForCatParents/edit",
+ "locator": "document.querySelector(\"#forum_sidebar\").value",
+ "required_contents": {
+ "must_include": [
+ "Cat friendly",
+ "Local vendors",
+ "Promotion",
+ "Toxic plants!"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 7
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 584,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Create a new forum named {{name}}, with a description of {{description}}, and include {{sidebar_list}} in the sidebar?",
+ "instantiation_dict": {
+ "name": "Karaoke",
+ "description": "Place for Karaoke lovers",
+ "sidebar_list": [
+ "devices",
+ "setup"
+ ]
+ },
+ "intent": "Create a new forum named Karaoke, with a description of Place for Karaoke lovers, and include ['devices', 'setup'] in the sidebar?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/Karaoke/edit",
+ "locator": "document.querySelector(\"#forum_description\").value",
+ "required_contents": {
+ "must_include": [
+ "Place for Karaoke lovers"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/Karaoke/edit",
+ "locator": "document.querySelector(\"#forum_sidebar\").value",
+ "required_contents": {
+ "must_include": [
+ "devices",
+ "setup"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 7
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 585,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?",
+ "instantiation_dict": {
+ "product": "floor lamp",
+ "num_star": 5,
+ "nickname": "Emma Lopez"
+ },
+ "intent": "Rate my recent purchase of floor lamp with 5 stars, using my nickname Emma Lopez?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "func:shopping_get_sku_latest_review_rating('B00J8RZL7I')",
+ "required_contents": {
+ "must_include": [
+ "100"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "func:shopping_get_sku_latest_review_author('B00J8RZL7I')",
+ "required_contents": {
+ "must_include": [
+ "Emma Lopez"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 194
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 586,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?",
+ "instantiation_dict": {
+ "product": "Jiffy Corn Muffin Cornbread Mix",
+ "num_star": 4,
+ "nickname": "ShoppingEmma"
+ },
+ "intent": "Rate my recent purchase of Jiffy Corn Muffin Cornbread Mix with 4 stars, using my nickname ShoppingEmma?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "func:shopping_get_sku_latest_review_rating('B07HZB38XH')",
+ "required_contents": {
+ "must_include": [
+ "80"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "func:shopping_get_sku_latest_review_author('B07HZB38XH')",
+ "required_contents": {
+ "must_include": [
+ "ShoppingEmma"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 194
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 587,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?",
+ "instantiation_dict": {
+ "product": "PS3 Remote Controllers",
+ "num_star": 3,
+ "nickname": "GamingEmma"
+ },
+ "intent": "Rate my recent purchase of PS3 Remote Controllers with 3 stars, using my nickname GamingEmma?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "func:shopping_get_sku_latest_review_rating('B0041MSF2S')",
+ "required_contents": {
+ "must_include": [
+ "60"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "func:shopping_get_sku_latest_review_author('B0041MSF2S')",
+ "required_contents": {
+ "must_include": [
+ "GamingEmma"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 194
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 588,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?",
+ "instantiation_dict": {
+ "product": "Foundation For Mattress With Frame Set",
+ "num_star": 1,
+ "nickname": "ShoppingEmma"
+ },
+ "intent": "Rate my recent purchase of Foundation For Mattress With Frame Set with 1 stars, using my nickname ShoppingEmma?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "func:shopping_get_sku_latest_review_rating('B07DFJ5XKH')",
+ "required_contents": {
+ "must_include": [
+ "20"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "func:shopping_get_sku_latest_review_author('B07DFJ5XKH')",
+ "required_contents": {
+ "must_include": [
+ "ShoppingEmma"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 194
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 589,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Rate my recent purchase of {{product}} with {{num_star}} stars, using my nickname {{nickname}}?",
+ "instantiation_dict": {
+ "product": "Mini Wireless Bluetooth Speaker",
+ "num_star": 2,
+ "nickname": "SimpleEmma"
+ },
+ "intent": "Rate my recent purchase of Mini Wireless Bluetooth Speaker with 2 stars, using my nickname SimpleEmma?",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "func:shopping_get_sku_latest_review_rating('B09P7BFL4H')",
+ "required_contents": {
+ "must_include": [
+ "40"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "func:shopping_get_sku_latest_review_author('B09P7BFL4H')",
+ "required_contents": {
+ "must_include": [
+ "SimpleEmma"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 194
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 590,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/primer/design",
+ "geolocation": null,
+ "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}",
+ "instantiation_dict": {
+ "event": "event of product launch",
+ "start_date": "1/16/2030",
+ "end_date": "1/30/2030"
+ },
+ "intent": "Create a milestone for the upcoming event of product launch starting on 1/16/2030 and ending on 1/30/2030",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/primer/design/-/milestones",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"#content-body\").outerText",
+ "required_contents": {
+ "must_include": [
+ "product launch"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.start_date').outerText",
+ "required_contents": {
+ "must_include": [
+ "Jan 16, 2030"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.due_date').outerText",
+ "required_contents": {
+ "must_include": [
+ "Jan 30, 2030"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 339
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 591,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/primer/design",
+ "geolocation": null,
+ "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}",
+ "instantiation_dict": {
+ "event": "practice of collective code review",
+ "start_date": "1/16/2030",
+ "end_date": "in 20 days"
+ },
+ "intent": "Create a milestone for the upcoming practice of collective code review starting on 1/16/2030 and ending on in 20 days",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/primer/design/-/milestones",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"#content-body\").outerText",
+ "required_contents": {
+ "must_include": [
+ "code review"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.start_date').outerText",
+ "required_contents": {
+ "must_include": [
+ "Jan 16, 2030"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.due_date').outerText",
+ "required_contents": {
+ "must_include": [
+ "Feb 5, 2030"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 339
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 592,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/primer/design",
+ "geolocation": null,
+ "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}",
+ "instantiation_dict": {
+ "event": "task of cleaning sensitive information",
+ "start_date": "2/16/2030",
+ "end_date": "in 20 days"
+ },
+ "intent": "Create a milestone for the upcoming task of cleaning sensitive information starting on 2/16/2030 and ending on in 20 days",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/primer/design/-/milestones",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"#content-body\").outerText",
+ "required_contents": {
+ "must_include": [
+ "sensitive information"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.start_date').outerText",
+ "required_contents": {
+ "must_include": [
+ "Feb 16, 2030"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.due_date').outerText",
+ "required_contents": {
+ "must_include": [
+ "Mar 8, 2030"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 339
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 593,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/byteblaze/dotfiles",
+ "geolocation": null,
+ "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}",
+ "instantiation_dict": {
+ "event": "task of merging all branches to main",
+ "start_date": "March 15, 2044",
+ "end_date": "March 30, 2044"
+ },
+ "intent": "Create a milestone for the upcoming task of merging all branches to main starting on March 15, 2044 and ending on March 30, 2044",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/byteblaze/dotfiles/-/milestones",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"#content-body\").outerText",
+ "required_contents": {
+ "must_include": [
+ "all branches to main"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.start_date').outerText",
+ "required_contents": {
+ "must_include": [
+ "Mar 15, 2044"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.due_date').outerText",
+ "required_contents": {
+ "must_include": [
+ "Mar 30, 2044"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 339
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 594,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/byteblaze/dotfiles",
+ "geolocation": null,
+ "intent_template": "Create a milestone for the upcoming {{event}} starting on {{start_date}} and ending on {{end_date}}",
+ "instantiation_dict": {
+ "event": "task of adding a new branch for zsh comprehensive support",
+ "start_date": "5/1/2044",
+ "end_date": "in 20 days"
+ },
+ "intent": "Create a milestone for the upcoming task of adding a new branch for zsh comprehensive support starting on 5/1/2044 and ending on in 20 days",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/byteblaze/dotfiles/-/milestones",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"#content-body\").outerText",
+ "required_contents": {
+ "must_include": [
+ "zsh comprehensive support"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.start_date').outerText",
+ "required_contents": {
+ "must_include": [
+ "May 1, 2044"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.due_date').outerText",
+ "required_contents": {
+ "must_include": [
+ "May 21, 2044"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 339
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 595,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.",
+ "instantiation_dict": {
+ "subreddit": "space"
+ },
+ "intent": "Open the thread of a trending post on the forum \"space\" and subscribe.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/space",
+ "program_html": [
+ {
+ "url": "__REDDIT__/",
+ "locator": "document.querySelector(\"#sidebar > section\").outerText",
+ "required_contents": {
+ "must_include": [
+ "space"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 4
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 596,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.",
+ "instantiation_dict": {
+ "subreddit": "books"
+ },
+ "intent": "Open the thread of a trending post on the forum \"books\" and subscribe.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/books",
+ "program_html": [
+ {
+ "url": "__REDDIT__/",
+ "locator": "document.querySelector(\"#sidebar > section\").outerText",
+ "required_contents": {
+ "must_include": [
+ "books"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 4
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 597,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.",
+ "instantiation_dict": {
+ "subreddit": "consoles"
+ },
+ "intent": "Open the thread of a trending post on the forum \"consoles\" and subscribe.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/consoles",
+ "program_html": [
+ {
+ "url": "__REDDIT__/",
+ "locator": "document.querySelector(\"#sidebar > section\").outerText",
+ "required_contents": {
+ "must_include": [
+ "consoles"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 4
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 598,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.",
+ "instantiation_dict": {
+ "subreddit": "pittsburgh"
+ },
+ "intent": "Open the thread of a trending post on the forum \"pittsburgh\" and subscribe.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/pittsburgh",
+ "program_html": [
+ {
+ "url": "__REDDIT__/",
+ "locator": "document.querySelector(\"#sidebar > section\").outerText",
+ "required_contents": {
+ "must_include": [
+ "pittsburgh"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 4
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 599,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Open the thread of a trending post on the forum \"{{subreddit}}\" and subscribe.",
+ "instantiation_dict": {
+ "subreddit": "machine learning"
+ },
+ "intent": "Open the thread of a trending post on the forum \"machine learning\" and subscribe.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/MachineLearning",
+ "program_html": [
+ {
+ "url": "__REDDIT__/",
+ "locator": "document.querySelector(\"#sidebar > section\").outerText",
+ "required_contents": {
+ "must_include": [
+ "MachineLearning"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 4
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 600,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer",
+ "instantiation_dict": {
+ "question": "what is the recommended console to buy these days"
+ },
+ "intent": "Post my question, \"what is the recommended console to buy these days\", in a subreddit where I'm likely to get an answer",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/consoles",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "what is the recommended console to buy these days"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 3765
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 601,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer",
+ "instantiation_dict": {
+ "question": "is car necessary in NYC"
+ },
+ "intent": "Post my question, \"is car necessary in NYC\", in a subreddit where I'm likely to get an answer",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/nyc",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "is car necessary in NYC"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 3765
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 602,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer",
+ "instantiation_dict": {
+ "question": "places for new drivers to learn driving in pittsburgh"
+ },
+ "intent": "Post my question, \"places for new drivers to learn driving in pittsburgh\", in a subreddit where I'm likely to get an answer",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/pittsburgh",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "places for new drivers to learn driving in pittsburgh"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 3765
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 603,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer",
+ "instantiation_dict": {
+ "question": "safe and budget apartment to live in nyc"
+ },
+ "intent": "Post my question, \"safe and budget apartment to live in nyc\", in a subreddit where I'm likely to get an answer",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/nyc",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "safe and budget apartment to live in nyc"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 3765
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 604,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post my question, \"{{question}}\", in a subreddit where I'm likely to get an answer",
+ "instantiation_dict": {
+ "question": "what is the SOTA web navigation agent repo"
+ },
+ "intent": "Post my question, \"what is the SOTA web navigation agent repo\", in a subreddit where I'm likely to get an answer",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/deeplearning |OR| __REDDIT__/f/MachineLearning |OR| __REDDIT__/f/singularity",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "what is the SOTA web navigation agent repo"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 3765
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 605,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there",
+ "instantiation_dict": {
+ "topic": "gaming consoles",
+ "question": "what is the recommended console to buy these days"
+ },
+ "intent": "Find a subreddit focused on topics related to gaming consoles, and post my question, \"what is the recommended console to buy these days\" there",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/consoles",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "what is the recommended console to buy these days"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 5
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 606,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there",
+ "instantiation_dict": {
+ "topic": "NYC",
+ "question": "is car necessary"
+ },
+ "intent": "Find a subreddit focused on topics related to NYC, and post my question, \"is car necessary\" there",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/nyc",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "is car necessary"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 5
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 607,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there",
+ "instantiation_dict": {
+ "topic": "city Pittsburgh",
+ "question": "places for new drivers to learn driving"
+ },
+ "intent": "Find a subreddit focused on topics related to city Pittsburgh, and post my question, \"places for new drivers to learn driving\" there",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/pittsburgh",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "places for new drivers to learn driving"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 5
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 608,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there",
+ "instantiation_dict": {
+ "topic": "city lives in DMV area",
+ "question": "safe and budget apartment to live"
+ },
+ "intent": "Find a subreddit focused on topics related to city lives in DMV area, and post my question, \"safe and budget apartment to live\" there",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/washington |OR| __REDDIT__/f/washingtondc",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "safe and budget apartment to live"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 5
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 609,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Find a subreddit focused on topics related to {{topic}}, and post my question, \"{{question}}\" there",
+ "instantiation_dict": {
+ "topic": "ML, DL, NLP",
+ "question": "what is the SOTA web navigation agent repo"
+ },
+ "intent": "Find a subreddit focused on topics related to ML, DL, NLP, and post my question, \"what is the SOTA web navigation agent repo\" there",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/deeplearning |OR| __REDDIT__/f/MachineLearning |OR| __REDDIT__/f/singularity",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "what is the SOTA web navigation agent repo"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 5
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 610,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".",
+ "instantiation_dict": {
+ "book": "To Kill a Mockingbird by Harper Lee",
+ "content": "good book!"
+ },
+ "intent": "Post a review of my recent reading \"To Kill a Mockingbird by Harper Lee\" in the r/books with my comment \"good book!\".",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/books",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "To Kill a Mockingbird by Harper Lee",
+ "good book!"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 9
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 611,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".",
+ "instantiation_dict": {
+ "book": "Harry Potter",
+ "content": "Wonderful journey"
+ },
+ "intent": "Post a review of my recent reading \"Harry Potter\" in the r/books with my comment \"Wonderful journey\".",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/books",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "Harry Potter",
+ "Wonderful journey"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 9
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 612,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".",
+ "instantiation_dict": {
+ "book": "big little lies",
+ "content": "can't stop it"
+ },
+ "intent": "Post a review of my recent reading \"big little lies\" in the r/books with my comment \"can't stop it\".",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/books",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "big little lies",
+ "can't stop it"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 9
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 613,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".",
+ "instantiation_dict": {
+ "book": "Love story",
+ "content": "I cried"
+ },
+ "intent": "Post a review of my recent reading \"Love story\" in the r/books with my comment \"I cried\".",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/books",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "Love story",
+ "I cried"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 9
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 614,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post a review of my recent reading \"{{book}}\" in the r/books with my comment \"{{content}}\".",
+ "instantiation_dict": {
+ "book": "Gone with the wind",
+ "content": "It's a book with history"
+ },
+ "intent": "Post a review of my recent reading \"Gone with the wind\" in the r/books with my comment \"It's a book with history\".",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/books",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "Gone with the wind",
+ "It's a book with history"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 9
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 615,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__/f/pics",
+ "geolocation": null,
+ "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"",
+ "instantiation_dict": {
+ "content": "Bald Eagle",
+ "subreddit": "earthporn"
+ },
+ "intent": "Re-post the image of Bald Eagle in this page to earthporn subreddit and note \"from /f/pics\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/earthporn",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "from /f/pics"
+ ]
+ }
+ },
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')",
+ "required_contents": {
+ "must_include": [
+ "b02113033af32feae9ff147dbbe3764039368d67d193885bd04e65c2e6beea9c.jpg"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 11
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 616,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__/f/pics",
+ "geolocation": null,
+ "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"",
+ "instantiation_dict": {
+ "content": "Thanksgiving turkey",
+ "subreddit": "funny"
+ },
+ "intent": "Re-post the image of Thanksgiving turkey in this page to funny subreddit and note \"from /f/pics\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/funny",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "from /f/pics"
+ ]
+ }
+ },
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')",
+ "required_contents": {
+ "must_include": [
+ "2e4fa0a328e653a97a7d07046291c298ef5b4e0d0c73a287f317ca86a8e8685f.jpg"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 11
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 617,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__/f/pics",
+ "geolocation": null,
+ "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"",
+ "instantiation_dict": {
+ "content": "Firework",
+ "subreddit": "earthporn"
+ },
+ "intent": "Re-post the image of Firework in this page to earthporn subreddit and note \"from /f/pics\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/earthporn",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "from /f/pics"
+ ]
+ }
+ },
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')",
+ "required_contents": {
+ "must_include": [
+ "92411be6af4e9ad5ccd3ccbaa01c10457bb00e704e99c58dd430de1a958307fd.jpg"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 11
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 618,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__/f/pics",
+ "geolocation": null,
+ "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"",
+ "instantiation_dict": {
+ "content": "Wife's costume",
+ "subreddit": "funny"
+ },
+ "intent": "Re-post the image of Wife's costume in this page to funny subreddit and note \"from /f/pics\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/funny",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "from /f/pics"
+ ]
+ }
+ },
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')",
+ "required_contents": {
+ "must_include": [
+ "6bfbf1280d28d84a9261695f0cac5a90addaaff1174807a2b381fdc159f3ed00.jpg"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 11
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 619,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__/f/pics",
+ "geolocation": null,
+ "intent_template": "Re-post the image of {{content}} in this page to {{subreddit}} subreddit and note \"from /f/pics\"",
+ "instantiation_dict": {
+ "content": "costume contest",
+ "subreddit": "funny"
+ },
+ "intent": "Re-post the image of costume contest in this page to funny subreddit and note \"from /f/pics\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/funny",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "from /f/pics"
+ ]
+ }
+ },
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "[...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')",
+ "required_contents": {
+ "must_include": [
+ "bd8bc5f4c846aac4df08626faa3a34a7d47c8f3bdd92bf615a54afd939f063a7.jpg"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 11
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 620,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Ask for advice about {{issue}} in a subreddit for relations",
+ "instantiation_dict": {
+ "issue": "deal with long-distance relationships"
+ },
+ "intent": "Ask for advice about deal with long-distance relationships in a subreddit for relations",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/relationship_advice",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage inquire about how to deal with long-distance relationships?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 12
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 621,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Ask for advice about {{issue}} in a subreddit for relations",
+ "instantiation_dict": {
+ "issue": "cheat"
+ },
+ "intent": "Ask for advice about cheat in a subreddit for relations",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/relationship_advice",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage inquire about how to deal with cheating in relationship?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 12
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 622,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Ask for advice about {{issue}} in a subreddit for relations",
+ "instantiation_dict": {
+ "issue": "sexual harassment"
+ },
+ "intent": "Ask for advice about sexual harassment in a subreddit for relations",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/relationship_advice",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage inquire about how to deal with sexual harassment in relationship?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 12
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 623,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Ask for advice about {{issue}} in a subreddit for relations",
+ "instantiation_dict": {
+ "issue": "gift for birthday"
+ },
+ "intent": "Ask for advice about gift for birthday in a subreddit for relations",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/relationship_advice",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage inquire about how to choose a gift for birthday in a relationship?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 12
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 624,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Ask for advice about {{issue}} in a subreddit for relations",
+ "instantiation_dict": {
+ "issue": "break-up remedy"
+ },
+ "intent": "Ask for advice about break-up remedy in a subreddit for relations",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/relationship_advice",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage inquire about how to deal with break-up in relationship?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 12
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 625,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"",
+ "instantiation_dict": {
+ "topic": "the effectiveness of online learning"
+ },
+ "intent": "Create a discussion post about \"the effectiveness of online learning\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/deeplearning",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage inquire about the effectiveness of online learning?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage contain the exact phrase 'your opinion'?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 13
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 626,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"",
+ "instantiation_dict": {
+ "topic": "Iphone 14"
+ },
+ "intent": "Create a discussion post about \"Iphone 14\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/iphone",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage ask for discussions about iphone 14?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage contain the exact phrase 'your opinion'?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 13
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 627,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"",
+ "instantiation_dict": {
+ "topic": "Harry Potter movie series"
+ },
+ "intent": "Create a discussion post about \"Harry Potter movie series\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/movies",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage ask for discussions about the Harry Potter movie series?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage contain the exact phrase 'your opinion'?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 13
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 628,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"",
+ "instantiation_dict": {
+ "topic": "long distance relationship"
+ },
+ "intent": "Create a discussion post about \"long distance relationship\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/relationship_advice",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage ask for discussions about long-distance relationship?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage contain the exact phrase 'your opinion'?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 13
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 629,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Create a discussion post about \"{{topic}}\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"",
+ "instantiation_dict": {
+ "topic": "Fun thing to do in Pittsburgh"
+ },
+ "intent": "Create a discussion post about \"Fun thing to do in Pittsburgh\" in a relevant subreddit and ask users for their opinions with the simple prompt, \"your opinion\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/pittsburgh",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage ask for discussions about fun things to do in Pittsburgh?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage contain the exact phrase 'your opinion'?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 13
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 630,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}",
+ "instantiation_dict": {
+ "category": "noise-cancelling headphones",
+ "price": "$200",
+ "subreddit": "r/headphones"
+ },
+ "intent": "Ask for product recommendations for noise-cancelling headphones within a budget of $200 in r/headphones",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/headphones",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage ask for noise-cancelling headphone recommendation?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the budget limit is $200?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 15
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 631,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}",
+ "instantiation_dict": {
+ "category": "running shoes",
+ "price": "$100",
+ "subreddit": "r/sports"
+ },
+ "intent": "Ask for product recommendations for running shoes within a budget of $100 in r/sports",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/sports",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage ask for running shoes recommendation?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the budget limit is $100?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 15
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 632,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}",
+ "instantiation_dict": {
+ "category": "running shoes",
+ "price": "$500",
+ "subreddit": "r/sports"
+ },
+ "intent": "Ask for product recommendations for running shoes within a budget of $500 in r/sports",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/sports",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage ask for running shoes recommendation?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the budget limit is $500?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 15
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 633,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}",
+ "instantiation_dict": {
+ "category": "running pants",
+ "price": "$500",
+ "subreddit": "r/sports"
+ },
+ "intent": "Ask for product recommendations for running pants within a budget of $500 in r/sports",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/sports",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage ask for running pants recommendation?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the budget limit is $500?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 15
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 634,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Ask for product recommendations for {{category}} within a budget of {{price}} in {{subreddit}}",
+ "instantiation_dict": {
+ "category": "used iphone",
+ "price": "$1000",
+ "subreddit": "r/iphone"
+ },
+ "intent": "Ask for product recommendations for used iphone within a budget of $1000 in r/iphone",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/iphone",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage ask for used iphone recommendation?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the budget limit is $1000?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 15
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 635,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}",
+ "instantiation_dict": {
+ "category": "noise-cancelling headphones",
+ "price": "$200"
+ },
+ "intent": "Post in the most appropriate subreddit and ask for recommendations for noise-cancelling headphones products within a budget of $200",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/headphones",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage ask for noise-cancelling headphone recommendation?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the budget limit is $200?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 6100
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 636,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}",
+ "instantiation_dict": {
+ "category": "DIY toolkit",
+ "price": "$100"
+ },
+ "intent": "Post in the most appropriate subreddit and ask for recommendations for DIY toolkit products within a budget of $100",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/DIY",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage ask for DIY toolkit recommendation?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the budget limit is $100?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 6100
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 637,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}",
+ "instantiation_dict": {
+ "category": "sony headphones",
+ "price": "$500"
+ },
+ "intent": "Post in the most appropriate subreddit and ask for recommendations for sony headphones products within a budget of $500",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/headphones",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage ask for Sony headphone recommendation?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the budget limit is $500?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 6100
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 638,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}",
+ "instantiation_dict": {
+ "category": "must-have product in my life",
+ "price": "$30"
+ },
+ "intent": "Post in the most appropriate subreddit and ask for recommendations for must-have product in my life products within a budget of $30",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/BuyItForLife",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this question ask for recommendation about must-have product in daily lives?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the budget limit is $30?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 6100
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 639,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post in the most appropriate subreddit and ask for recommendations for {{category}} products within a budget of {{price}}",
+ "instantiation_dict": {
+ "category": "used iphone",
+ "price": "$1000"
+ },
+ "intent": "Post in the most appropriate subreddit and ask for recommendations for used iphone products within a budget of $1000",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/iphone",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage ask for used iphone recommendation?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the budget limit is $1000?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 6100
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 640,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit",
+ "instantiation_dict": {
+ "interest": "book reading",
+ "date": "March 15th",
+ "subreddit": "r/books"
+ },
+ "intent": "Post a notice on a virtual meetup for book reading enthusiasts on March 15th in the r/books subreddit",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/books",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "book reading",
+ "March 15th",
+ "virtual meetup"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 16
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 641,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit",
+ "instantiation_dict": {
+ "interest": "Harry Potter",
+ "date": "July 8th",
+ "subreddit": "books"
+ },
+ "intent": "Post a notice on a virtual meetup for Harry Potter enthusiasts on July 8th in the books subreddit",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/books",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage talk about a virtual meetup for Harry Potter enthusiasts?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on July 8th?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 16
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 642,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit",
+ "instantiation_dict": {
+ "interest": "Big little lies",
+ "date": "Sep 10th",
+ "subreddit": "books"
+ },
+ "intent": "Post a notice on a virtual meetup for Big little lies enthusiasts on Sep 10th in the books subreddit",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/books",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage talk about a virtual meetup for Big little lies enthusiasts?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on September 10th?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 16
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 643,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit",
+ "instantiation_dict": {
+ "interest": "racing cars",
+ "date": "Oct 21st",
+ "subreddit": "nyc"
+ },
+ "intent": "Post a notice on a virtual meetup for racing cars enthusiasts on Oct 21st in the nyc subreddit",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/nyc",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage talk about a virtual meetup for racing cars enthusiasts?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on October 21st?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 16
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 644,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post a notice on a virtual meetup for {{interest}} enthusiasts on {{date}} in the {{subreddit}} subreddit",
+ "instantiation_dict": {
+ "interest": "Tears of Kingdom",
+ "date": "Dec 15th",
+ "subreddit": "games"
+ },
+ "intent": "Post a notice on a virtual meetup for Tears of Kingdom enthusiasts on Dec 15th in the games subreddit",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/games",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage talk about a virtual meetup for Tears of Kingdom enthusiasts?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on December 15th?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 16
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 645,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post in {{subreddit}} subreddit about how could machine learning help the corresponding field.",
+ "instantiation_dict": {
+ "subreddit": "books"
+ },
+ "intent": "Post in books subreddit about how could machine learning help the corresponding field.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/books",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage inquire about how could machine learning help the world of books?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 19
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 646,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post in {{subreddit}} subreddit about how could midjourney help the corresponding field.",
+ "instantiation_dict": {
+ "subreddit": "DIY"
+ },
+ "intent": "Post in DIY subreddit about how could midjourney help the corresponding field.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/diy",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [{
+ "question": "Does this passage inquire about how could midjourney help DIY?",
+ "answer": "yes"
+ }]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 19
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 647,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post in {{subreddit}} forum about how could open-source LLMs help the corresponding field.",
+ "instantiation_dict": {
+ "subreddit": "technology"
+ },
+ "intent": "Post in technology forum about how could open-source LLMs help the corresponding field.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/technology",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage inquire about how could open-source LLMs help the field of technology?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 19
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 648,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post in {{subreddit}} forum about how could large language models help the corresponding field.",
+ "instantiation_dict": {
+ "subreddit": "dataisbeautiful"
+ },
+ "intent": "Post in dataisbeautiful forum about how could large language models help the corresponding field.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/dataisbeautiful",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage inquire about how could large language models help the field of data visualization?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 19
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 649,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Post in {{subreddit}} subreddit about how could diffusion model help the corresponding field.",
+ "instantiation_dict": {
+ "subreddit": "history"
+ },
+ "intent": "Post in history subreddit about how could diffusion model help the corresponding field.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/history",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage inquire about how diffusion models could help the field of history?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 19
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 650,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists",
+ "geolocation": null,
+ "intent_template": "Reply to {{position_description}} with my comment \"{{content_description}}\"",
+ "instantiation_dict": {
+ "position_description": "the post",
+ "content_description": "I am a big fan of the bookorg"
+ },
+ "intent": "Reply to the post with my comment \"I am a big fan of the bookorg\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('.comment__body').outerText",
+ "required_contents": {
+ "exact_match": "I am a big fan of the bookorg"
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 23
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 651,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__/f/pittsburgh/45899/driving-in-pittsburgh-summed-up-by-one-traffic-sign",
+ "geolocation": null,
+ "intent_template": "Reply to {{position_description}} with my comment \"{{content_description}}\"",
+ "instantiation_dict": {
+ "position_description": "the post",
+ "content_description": "Yeah, pittsburgh traffic, you know..."
+ },
+ "intent": "Reply to the post with my comment \"Yeah, pittsburgh traffic, you know...\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/pittsburgh/45899/driving-in-pittsburgh-summed-up-by-one-traffic-sign",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('.comment__body').outerText",
+ "required_contents": {
+ "exact_match": "Yeah, pittsburgh traffic, you know..."
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 23
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 652,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__/f/technology/134852/ai-experts-disown-musk-backed-campaign-citing-their-research",
+ "geolocation": null,
+ "intent_template": "Reply to {{position_description}} with my comment \"{{content_description}}\"",
+ "instantiation_dict": {
+ "position_description": "the post",
+ "content_description": "???"
+ },
+ "intent": "Reply to the post with my comment \"???\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/technology/134852/ai-experts-disown-musk-backed-campaign-citing-their-research",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('.comment__body').outerText",
+ "required_contents": {
+ "exact_match": "???"
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 23
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 653,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.",
+ "instantiation_dict": {
+ "product": "phone screen protector",
+ "order_id": "000000180"
+ },
+ "intent": "Fill the \"contact us\" form in the site for a refund on the phone screen protector I bought, stating that it broke after just three days of use. Also, ensure to include the order number #000000180 and the product SKU. Don't submit yet, I will check.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/contact",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"]').value",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B087QJN9W1?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 153
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 654,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.",
+ "instantiation_dict": {
+ "product": "bluetooth speaker",
+ "order_id": "161"
+ },
+ "intent": "Fill the \"contact us\" form in the site for a refund on the bluetooth speaker I bought, stating that it broke after just three days of use. Also, ensure to include the order number #161 and the product SKU. Don't submit yet, I will check.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/contact",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"]').value",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 161?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B09P7BFL4H?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 153
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 655,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.",
+ "instantiation_dict": {
+ "product": "iphone case",
+ "order_id": "180"
+ },
+ "intent": "Fill the \"contact us\" form in the site for a refund on the iphone case I bought, stating that it broke after just three days of use. Also, ensure to include the order number #180 and the product SKU. Don't submit yet, I will check.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/contact",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"]').value",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 180?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B087QJN9W1?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 153
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 656,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.",
+ "instantiation_dict": {
+ "product": "remote controller",
+ "order_id": "180"
+ },
+ "intent": "Fill the \"contact us\" form in the site for a refund on the remote controller I bought, stating that it broke after just three days of use. Also, ensure to include the order number #180 and the product SKU. Don't submit yet, I will check.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/contact",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"]').value",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 180?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B0041MSF2S?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 153
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 657,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Fill the \"contact us\" form in the site for a refund on the {{product}} I bought, stating that it broke after just three days of use. Also, ensure to include the order number #{{order_id}} and the product SKU. Don't submit yet, I will check.",
+ "instantiation_dict": {
+ "product": "speaker",
+ "order_id": "148"
+ },
+ "intent": "Fill the \"contact us\" form in the site for a refund on the speaker I bought, stating that it broke after just three days of use. Also, ensure to include the order number #148 and the product SKU. Don't submit yet, I will check.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/contact",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"]').value",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 148?",
+ "answer": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B003FVW3VA?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 153
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 658,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}",
+ "instantiation_dict": {
+ "repo": "a11yproject",
+ "issue": "401 bad gateway",
+ "account": "Roshanjossey",
+ "due": "the end of 2030"
+ },
+ "intent": "Create an issue in a11yproject repo with title \"401 bad gateway\". Assign the issue to Roshanjossey. Set due date to be the end of 2030",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText",
+ "required_contents": {
+ "exact_match": "401 bad gateway"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"]').outerText",
+ "required_contents": {
+ "must_include": [
+ "Dec 31, 2030"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.assignee').outerText",
+ "required_contents": {
+ "must_include": [
+ "Roshan Jossey"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 327
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 659,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}",
+ "instantiation_dict": {
+ "repo": "empathy-prompts",
+ "issue": "Integrating LLMs for better prompts",
+ "account": "Roshanjossey",
+ "due": "the beginning of Q2 2033"
+ },
+ "intent": "Create an issue in empathy-prompts repo with title \"Integrating LLMs for better prompts\". Assign the issue to Roshanjossey. Set due date to be the beginning of Q2 2033",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/byteblaze/empathy-prompts/-/issues",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText",
+ "required_contents": {
+ "exact_match": "Integrating LLMs for better prompts"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"]').outerText",
+ "required_contents": {
+ "must_include": [
+ "Apr 1, 2033"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.assignee').outerText",
+ "required_contents": {
+ "must_include": [
+ "Roshan Jossey"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 327
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 660,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}",
+ "instantiation_dict": {
+ "repo": "dotfiles",
+ "issue": "add support for oh-my-zsh",
+ "account": "Abishek",
+ "due": "July 18 2033"
+ },
+ "intent": "Create an issue in dotfiles repo with title \"add support for oh-my-zsh\". Assign the issue to Abishek. Set due date to be July 18 2033",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/byteblaze/dotfiles/-/issues",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText",
+ "required_contents": {
+ "exact_match": "add support for oh-my-zsh"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"]').outerText",
+ "required_contents": {
+ "must_include": [
+ "Jul 18, 2033"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.assignee').outerText",
+ "required_contents": {
+ "must_include": [
+ "Abishek S"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 327
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 661,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Open an issue to {{issue}} in {{repo}}.",
+ "instantiation_dict": {
+ "repo": "ChatGPT",
+ "issue": "report the issue of connection refused"
+ },
+ "intent": "Open an issue to report the issue of connection refused in ChatGPT.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/convexegg/chatgpt/-/issues",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('.detail-page-description').outerText",
+ "required_contents": {
+ "must_include": [
+ "connection refused"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 328
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 662,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Open an issue to {{issue}} in {{repo}}.",
+ "instantiation_dict": {
+ "repo": "aem-hacker",
+ "issue": "report experiencing \"OSError: [Errno 98] Address already in use\" during executions"
+ },
+ "intent": "Open an issue to report experiencing \"OSError: [Errno 98] Address already in use\" during executions in aem-hacker.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/0ang3el/aem-hacker/-/issues",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('.detail-page-description').outerText",
+ "required_contents": {
+ "must_include": [
+ "OSError: [Errno 98] Address already in use"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 328
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 663,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Open an issue to {{issue}} in {{repo}}.",
+ "instantiation_dict": {
+ "repo": "metaseq",
+ "issue": "ask their plan on supporting Llama and other llama family models"
+ },
+ "intent": "Open an issue to ask their plan on supporting Llama and other llama family models in metaseq.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/root/metaseq/-/issues",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('.detail-page-description').outerText",
+ "required_contents": {
+ "must_include": [
+ "llama"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 328
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 664,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Open an issue to {{issue}} in {{repo}}.",
+ "instantiation_dict": {
+ "repo": "awesome-python",
+ "issue": "ask their plans on adding Python 3.11 related resources"
+ },
+ "intent": "Open an issue to ask their plans on adding Python 3.11 related resources in awesome-python.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/vinta/awesome-python/-/issues",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('.detail-page-description').outerText",
+ "required_contents": {
+ "must_include": [
+ "Python 3.11"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 328
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 665,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Open an issue to {{issue}} in {{repo}}.",
+ "instantiation_dict": {
+ "repo": "a11y-syntax-highlighting",
+ "issue": "request adding support for MT theme editor"
+ },
+ "intent": "Open an issue to request adding support for MT theme editor in a11y-syntax-highlighting.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/byteblaze/a11y-syntax-highlighting/-/issues",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('.detail-page-description').outerText",
+ "required_contents": {
+ "must_include": [
+ "MT theme editor"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 328
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 666,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/primer/design",
+ "geolocation": null,
+ "intent_template": "Submit a request to merge {{source_branch}} branch into {{target_branch}} branch, assign {{reviewer}} as the reviewer",
+ "instantiation_dict": {
+ "source_branch": "dialog-component",
+ "target_branch": "dialog",
+ "reviewer": "Carol"
+ },
+ "intent": "Submit a request to merge dialog-component branch into dialog branch, assign Carol as the reviewer",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/primer/design/-/merge_requests",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText",
+ "required_contents": {
+ "exact_match": "dialog"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText",
+ "required_contents": {
+ "exact_match": "dialog-component"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.reviewer').outerText",
+ "required_contents": {
+ "must_include": [
+ "Caroline Stewart"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 335
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 667,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/primer/design",
+ "geolocation": null,
+ "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer",
+ "instantiation_dict": {
+ "source_branch": "dialog-component",
+ "target_branch": "bump-doctocat",
+ "reviewer": "primer"
+ },
+ "intent": "Submit a merge request for dialog-component branch to be merged into bump-doctocat branch, assign primer as the reviewer",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/primer/design/-/merge_requests",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText",
+ "required_contents": {
+ "exact_match": "bump-doctocat"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText",
+ "required_contents": {
+ "exact_match": "dialog-component"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.reviewer').outerText",
+ "required_contents": {
+ "must_include": [
+ "Primer"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 335
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 668,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer",
+ "instantiation_dict": {
+ "source_branch": "a11yproject.com/redesign",
+ "target_branch": "main",
+ "reviewer": "Justin Armstrong"
+ },
+ "intent": "Submit a merge request for a11yproject.com/redesign branch to be merged into main branch, assign Justin Armstrong as the reviewer",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText",
+ "required_contents": {
+ "exact_match": "redesign"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText",
+ "required_contents": {
+ "exact_match": "main"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.reviewer').outerText",
+ "required_contents": {
+ "must_include": [
+ "Justin Armstrong"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 335
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 669,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/byteblaze/solarized-prism-theme",
+ "geolocation": null,
+ "intent_template": "Open a new issue to discuss the implementation of {{feature}}",
+ "instantiation_dict": {
+ "feature": "dark mode"
+ },
+ "intent": "Open a new issue to discuss the implementation of dark mode",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/byteblaze/solarized-prism-theme/-/issues",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('.detail-page-description').outerText",
+ "required_contents": {
+ "must_include": [
+ "implementation",
+ "dark mode"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 337
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 670,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/byteblaze/dotfiles",
+ "geolocation": null,
+ "intent_template": "Open a new issue to discuss the implementation of {{feature}}",
+ "instantiation_dict": {
+ "feature": "default plugins for .zsh"
+ },
+ "intent": "Open a new issue to discuss the implementation of default plugins for .zsh",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/byteblaze/dotfiles/-/issues",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('.detail-page-description').outerText",
+ "required_contents": {
+ "must_include": [
+ "implementation",
+ "default plugins",
+ "zsh"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 337
+ },
+ {
+ "sites": [
+ "shopping",
+ "reddit"
+ ],
+ "task_id": 671,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"",
+ "instantiation_dict": {
+ "product": "Sony Computer Entertainment VR",
+ "rating": "2 stars and less"
+ },
+ "intent": "Gather the titles of Sony Computer Entertainment VR reviews with 2 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Sony Computer Entertainment VR\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/gaming",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__title').outerText",
+ "required_contents": {
+ "exact_match": "real user feedback on Sony Computer Entertainment VR"
+ }
+ },
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__body').outerText",
+ "required_contents": {
+ "must_include": [
+ "didn't last a year without issues",
+ "Disappointing. Didn't last long before it stopped powering on and needed to be sent in for repair.",
+ "Received used items!!"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 101
+ },
+ {
+ "sites": [
+ "shopping",
+ "reddit"
+ ],
+ "task_id": 672,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"",
+ "instantiation_dict": {
+ "product": "Nintendo Switch Fortnite Wildcat Console EU",
+ "rating": "3 stars and less"
+ },
+ "intent": "Gather the titles of Nintendo Switch Fortnite Wildcat Console EU reviews with 3 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Nintendo Switch Fortnite Wildcat Console EU\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/gaming",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__title').outerText",
+ "required_contents": {
+ "exact_match": "real user feedback on Nintendo Switch Fortnite Wildcat Console EU"
+ }
+ },
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__body').outerText",
+ "required_contents": {
+ "must_include": [
+ "EU charger and wild cat card doesn\u2019t even work!",
+ "REFUND REJECTED",
+ "Charging port not compatible",
+ "not compatible in the US",
+ "Wildcard Bonus Credits Not Redeemable!",
+ "Code not available!!"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 101
+ },
+ {
+ "sites": [
+ "shopping",
+ "reddit"
+ ],
+ "task_id": 673,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"",
+ "instantiation_dict": {
+ "product": "Racing Wheel Overdrive for Xbox X",
+ "rating": "1 star"
+ },
+ "intent": "Gather the titles of Racing Wheel Overdrive for Xbox X reviews with 1 star rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Racing Wheel Overdrive for Xbox X\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/gaming",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__title').outerText",
+ "required_contents": {
+ "exact_match": "real user feedback on Racing Wheel Overdrive for Xbox X"
+ }
+ },
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__body').outerText",
+ "required_contents": {
+ "must_include": [
+ "Unable to set neutral steering",
+ "Doesn\u2019t work with PC",
+ "Crazy problems in automatic mode",
+ "pedals stopped working",
+ "Only works with certain games"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 101
+ },
+ {
+ "sites": [
+ "shopping",
+ "reddit"
+ ],
+ "task_id": 674,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"",
+ "instantiation_dict": {
+ "product": "Doc and Pies Arcade Factory Cocktail Arcade Machine",
+ "rating": "3 stars and less"
+ },
+ "intent": "Gather the titles of Doc and Pies Arcade Factory Cocktail Arcade Machine reviews with 3 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on Doc and Pies Arcade Factory Cocktail Arcade Machine\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/gaming",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__title').outerText",
+ "required_contents": {
+ "exact_match": "real user feedback on Doc and Pies Arcade Factory Cocktail Arcade Machine"
+ }
+ },
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__body').outerText",
+ "required_contents": {
+ "must_include": [
+ "Poorly Made Exterior. Consider a different Company.",
+ "piece of junk ,..can't believe I spent money on this !!!!",
+ "Based arrived broken but game itself works"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 101
+ },
+ {
+ "sites": [
+ "shopping",
+ "reddit"
+ ],
+ "task_id": 675,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Gather the titles of {{product}} reviews with {{rating}} rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on {{product}}\"",
+ "instantiation_dict": {
+ "product": "HORI 3D Surround Gaming Neckset",
+ "rating": "2 stars and less"
+ },
+ "intent": "Gather the titles of HORI 3D Surround Gaming Neckset reviews with 2 stars and less rating from OneStopShop, and post them in the games subreddit under the title \"real user feedback on HORI 3D Surround Gaming Neckset\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/gaming",
+ "program_html": [
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__title').outerText",
+ "required_contents": {
+ "exact_match": "real user feedback on HORI 3D Surround Gaming Neckset"
+ }
+ },
+ {
+ "url": "func:reddit_get_post_url('__last_url__')",
+ "locator": "document.querySelector('.submission__body').outerText",
+ "required_contents": {
+ "must_include": [
+ "Not worth it for PC users",
+ "I really wanted to like this.",
+ "I wish this was better..."
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 101
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 676,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Lookup orders that are {{status}}",
+ "instantiation_dict": {
+ "status": "suspected of being fraudulent"
+ },
+ "intent": "Lookup orders that are suspected of being fraudulent",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/sales/order/",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText",
+ "required_contents": {
+ "must_include": [
+ "Suspected Fraud"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 253
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 677,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Lookup orders that are {{status}}",
+ "instantiation_dict": {
+ "status": "processing"
+ },
+ "intent": "Lookup orders that are processing",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/sales/order/",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText",
+ "required_contents": {
+ "must_include": [
+ "Processing"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 253
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 678,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Lookup orders that are {{status}}",
+ "instantiation_dict": {
+ "status": "canceled"
+ },
+ "intent": "Lookup orders that are canceled",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/sales/order/",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText",
+ "required_contents": {
+ "must_include": [
+ "Canceled"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 253
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 679,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Lookup orders that are {{status}}",
+ "instantiation_dict": {
+ "status": "completed"
+ },
+ "intent": "Lookup orders that are completed",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/sales/order/",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText",
+ "required_contents": {
+ "must_include": [
+ "Completed"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 253
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 680,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Lookup orders that are {{status}}",
+ "instantiation_dict": {
+ "status": "on hold"
+ },
+ "intent": "Lookup orders that are on hold",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/sales/order/",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div.admin__data-grid-filters-current\").outerText",
+ "required_contents": {
+ "must_include": [
+ "On Hold"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 253
+ },
+ {
+ "sites": [
+ "reddit",
+ "gitlab"
+ ],
+ "task_id": 681,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab.reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Find a GitLab repository related to {{topic}} and make a Reddit post linking to it in a relevant subreddit",
+ "instantiation_dict": {
+ "topic": "gan implementation"
+ },
+ "intent": "Find a GitLab repository related to gan implementation and make a Reddit post linking to it in a relevant subreddit",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/deeplearning |OR| __REDDIT__/f/technology",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "gan implementation",
+ "__GITLAB__/eriklindernoren/PyTorch-GAN"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 116
+ },
+ {
+ "sites": [
+ "reddit",
+ "gitlab"
+ ],
+ "task_id": 682,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab.reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Find a GitLab repository related to {{topic}} and make a Reddit post linking to it in a relevant subreddit",
+ "instantiation_dict": {
+ "topic": "chatGPT"
+ },
+ "intent": "Find a GitLab repository related to chatGPT and make a Reddit post linking to it in a relevant subreddit",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/singularity |OR| __REDDIT__/f/deeplearning |OR| __REDDIT__/f/technology",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "chatgpt",
+ "__GITLAB__/convexegg/chatgpt"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 116
+ },
+ {
+ "sites": [
+ "reddit",
+ "gitlab"
+ ],
+ "task_id": 683,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab.reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Find a GitLab repository related to {{topic}} and make a Reddit post linking to it in a relevant subreddit",
+ "instantiation_dict": {
+ "topic": "metaseq"
+ },
+ "intent": "Find a GitLab repository related to metaseq and make a Reddit post linking to it in a relevant subreddit",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/machinelearning |OR| __REDDIT__/f/deeplearning",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('.submission__inner').outerText",
+ "required_contents": {
+ "must_include": [
+ "metaseq",
+ "__GITLAB__/root/metaseq"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 116
+ },
+ {
+ "sites": [
+ "reddit",
+ "gitlab"
+ ],
+ "task_id": 684,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab.reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.",
+ "instantiation_dict": {
+ "repo": "byteblaze/cloud-to-butt",
+ "subreddit": "LifeProTips"
+ },
+ "intent": "Promote byteblaze/cloud-to-butt to subreddit LifeProTips with the description from the repo itself.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/LifeProTips",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')",
+ "required_contents": {
+ "must_include": [
+ "Chrome extension that replaces occurrences of 'the cloud' with 'my butt'",
+ "__GITLAB__/byteblaze/cloud-to-butt"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 117
+ },
+ {
+ "sites": [
+ "reddit",
+ "gitlab"
+ ],
+ "task_id": 685,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab.reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.",
+ "instantiation_dict": {
+ "repo": "byteblaze/dotfiles",
+ "subreddit": "aww"
+ },
+ "intent": "Promote byteblaze/dotfiles to subreddit aww with the description from the repo itself.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/aww",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')",
+ "required_contents": {
+ "must_include": [
+ "Computer setup",
+ "__GITLAB__/byteblaze/dotfiles"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 117
+ },
+ {
+ "sites": [
+ "reddit",
+ "gitlab"
+ ],
+ "task_id": 686,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab.reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.",
+ "instantiation_dict": {
+ "repo": "auth0/angular-storage",
+ "subreddit": "technology"
+ },
+ "intent": "Promote auth0/angular-storage to subreddit technology with the description from the repo itself.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/technology",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')",
+ "required_contents": {
+ "must_include": [
+ "A storage library for AngularJS done right",
+ "__GITLAB__/auth0/angular-storage"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 117
+ },
+ {
+ "sites": [
+ "reddit",
+ "gitlab"
+ ],
+ "task_id": 687,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab.reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.",
+ "instantiation_dict": {
+ "repo": "koush/AndroidAsync",
+ "subreddit": "funny"
+ },
+ "intent": "Promote koush/AndroidAsync to subreddit funny with the description from the repo itself.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/funny",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')",
+ "required_contents": {
+ "must_include": [
+ "Asynchronous socket, http(s) (client+server) and websocket library for android. Based on nio, not threads.",
+ "__GITLAB__/koush/AndroidAsync"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 117
+ },
+ {
+ "sites": [
+ "reddit",
+ "gitlab"
+ ],
+ "task_id": 688,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab.reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Promote {{repo}} to subreddit {{subreddit}} with the description from the repo itself.",
+ "instantiation_dict": {
+ "repo": "lahwaacz/arch-wiki-docs",
+ "subreddit": "science"
+ },
+ "intent": "Promote lahwaacz/arch-wiki-docs to subreddit science with the description from the repo itself.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__REDDIT__/f/science",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('.submission__inner').outerText + [...document.querySelector('.submission__inner').querySelectorAll('[href],[src]')].map(elem => elem.getAttribute('href') || elem.getAttribute('src')).join(' ')",
+ "required_contents": {
+ "must_include": [
+ "A script to download pages from Arch Wiki for offline browsing",
+ "__GITLAB__/lahwaacz/arch-wiki-docs"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 117
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 689,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}",
+ "instantiation_dict": {
+ "reason": "I am a loyal customer"
+ },
+ "intent": "Draft an email to the shop owner via their contact us function for a coupon as I am a loyal customer",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/contact",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to being a loyal customer?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 163
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 690,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}",
+ "instantiation_dict": {
+ "reason": "they promised me a coupon last time"
+ },
+ "intent": "Draft an email to the shop owner via their contact us function for a coupon as they promised me a coupon last time",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/contact",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to the promise from the shop before?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 163
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 691,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}",
+ "instantiation_dict": {
+ "reason": "I plan to make a bulk purchase"
+ },
+ "intent": "Draft an email to the shop owner via their contact us function for a coupon as I plan to make a bulk purchase",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/contact",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to the plan of bulk purchase?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 163
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 692,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}",
+ "instantiation_dict": {
+ "reason": "I am a student"
+ },
+ "intent": "Draft an email to the shop owner via their contact us function for a coupon as I am a student",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/contact",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to being a student?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 163
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 693,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Draft an email to the shop owner via their contact us function for a coupon as {{reason}}",
+ "instantiation_dict": {
+ "reason": "my refund is supposed to be replaced by a coupon"
+ },
+ "intent": "Draft an email to the shop owner via their contact us function for a coupon as my refund is supposed to be replaced by a coupon",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING__/contact",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[title=\"What\u2019s on your mind?\"').value",
+ "required_contents": {
+ "qa": [
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "answer": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is to compensate for a refund?",
+ "answer": "yes"
+ }
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 163
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 694,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}",
+ "instantiation_dict": {
+ "product": "Energy-Bulk Women Shirt",
+ "stock": "50",
+ "size": "S",
+ "color": "blue",
+ "price": "60"
+ },
+ "intent": "Add a simple product named Energy-Bulk Women Shirt with 50 in stock, available in size S and color blue, priced at $60",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/catalog/product",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "60.00"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[name]\"').value",
+ "required_contents": {
+ "must_include": [
+ "Energy-Bulk Women Shirt"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value",
+ "required_contents": {
+ "exact_match": "50"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-role=\"selected-option\"').outerText",
+ "required_contents": {
+ "must_include": [
+ "top"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[size]\"').value",
+ "required_contents": {
+ "exact_match": "167"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[color]\"').value",
+ "required_contents": {
+ "exact_match": "50"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-index=\"category_ids\"').outerText",
+ "required_contents": {
+ "must_include": [
+ "tops"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 256
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 695,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}",
+ "instantiation_dict": {
+ "product": "Energy-Bulk Man Yoga Pant",
+ "stock": "50",
+ "size": "38",
+ "color": "yellow",
+ "price": "69.99"
+ },
+ "intent": "Add a simple product named Energy-Bulk Man Yoga Pant with 50 in stock, available in size 38 and color yellow, priced at $69.99",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/catalog/product",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "69.99"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[name]\"').value",
+ "required_contents": {
+ "must_include": [
+ "Energy-Bulk Man Yoga Pant"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value",
+ "required_contents": {
+ "exact_match": "50"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-role=\"selected-option\"').outerText",
+ "required_contents": {
+ "must_include": [
+ "bottom"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[size]\"').value",
+ "required_contents": {
+ "exact_match": "179"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[color]\"').value",
+ "required_contents": {
+ "exact_match": "60"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-index=\"category_ids\"').outerText",
+ "required_contents": {
+ "must_include": [
+ "bottoms"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 256
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 696,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}",
+ "instantiation_dict": {
+ "product": "FancyBoy Man Causal Jeans",
+ "stock": "42",
+ "size": "34",
+ "color": "Blue",
+ "price": "169.99"
+ },
+ "intent": "Add a simple product named FancyBoy Man Causal Jeans with 42 in stock, available in size 34 and color Blue, priced at $169.99",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/catalog/product",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[name]\"').value",
+ "required_contents": {
+ "must_include": [
+ "FancyBoy Man Causal Jeans"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value",
+ "required_contents": {
+ "exact_match": "42"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "169.99"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-role=\"selected-option\"').outerText",
+ "required_contents": {
+ "must_include": [
+ "bottom"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[size]\"').value",
+ "required_contents": {
+ "exact_match": "177"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[color]\"').value",
+ "required_contents": {
+ "exact_match": "50"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-index=\"category_ids\"').outerText",
+ "required_contents": {
+ "must_include": [
+ "bottoms"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 256
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 697,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}",
+ "instantiation_dict": {
+ "product": "Swaatch Smart Watch",
+ "stock": "42",
+ "size": "uni-size",
+ "color": "Blue",
+ "price": "769.99"
+ },
+ "intent": "Add a simple product named Swaatch Smart Watch with 42 in stock, available in size uni-size and color Blue, priced at $769.99",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/catalog/product",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[name]\"').value",
+ "required_contents": {
+ "must_include": [
+ "Swaatch Smart Watch"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value",
+ "required_contents": {
+ "exact_match": "42"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "769.99"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-role=\"selected-option\"').outerText",
+ "required_contents": {
+ "must_include": [
+ "gear"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[color]\"').value",
+ "required_contents": {
+ "exact_match": "50"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-index=\"category_ids\"').outerText",
+ "required_contents": {
+ "must_include": [
+ "watches"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 256
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 698,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Add a simple product named {{product}} with {{stock}} in stock, available in size {{size}} and color {{color}}, priced at ${{price}}",
+ "instantiation_dict": {
+ "product": "Lelelumon Yoga Mat",
+ "stock": "42",
+ "size": "uni-size",
+ "color": "black",
+ "price": "769.99"
+ },
+ "intent": "Add a simple product named Lelelumon Yoga Mat with 42 in stock, available in size uni-size and color black, priced at $769.99",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/catalog/product",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[name]\"').value",
+ "required_contents": {
+ "must_include": [
+ "Lelelumon Yoga Mat"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value",
+ "required_contents": {
+ "exact_match": "42"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "769.99"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-role=\"selected-option\"').outerText",
+ "required_contents": {
+ "must_include": [
+ "gear"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"product[color]\"').value",
+ "required_contents": {
+ "exact_match": "49"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-index=\"category_ids\"').outerText",
+ "required_contents": {
+ "must_include": [
+ "fitness equipment"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 256
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 699,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers",
+ "instantiation_dict": {
+ "topic": "spring sale",
+ "rule": "a 20 percent discount site-wide"
+ },
+ "intent": "Draft a new marketing price rule for spring sale that offers a 20 percent discount site-wide for all customers",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"[name='name'\").value",
+ "required_contents": {
+ "must_include": [
+ "spring sale"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"website_ids\"').selectedIndex",
+ "required_contents": {
+ "exact_match": "0"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"simple_action\"').value",
+ "prep_actions": [
+ "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"
+ ],
+ "required_contents": {
+ "exact_match": "by_percent"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"discount_amount\"').value",
+ "prep_actions": [
+ "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"
+ ],
+ "required_contents": {
+ "exact_match": "20"
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 258
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 700,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers",
+ "instantiation_dict": {
+ "topic": "fall discount",
+ "rule": "$10 discount on checkout"
+ },
+ "intent": "Draft a new marketing price rule for fall discount that offers $10 discount on checkout for all customers",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"[name='name'\").value",
+ "required_contents": {
+ "must_include": [
+ "fall discount"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"website_ids\"').selectedIndex",
+ "required_contents": {
+ "exact_match": "0"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"simple_action\"').value",
+ "prep_actions": [
+ "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"
+ ],
+ "required_contents": {
+ "exact_match": "cart_fixed"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"discount_amount\"').value",
+ "prep_actions": [
+ "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"
+ ],
+ "required_contents": {
+ "exact_match": "10"
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 258
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 701,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers",
+ "instantiation_dict": {
+ "topic": "Mother's day sale",
+ "rule": "$15 discount on checkout"
+ },
+ "intent": "Draft a new marketing price rule for Mother's day sale that offers $15 discount on checkout for all customers",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"[name='name'\").value",
+ "required_contents": {
+ "must_include": [
+ "Mother's day sale"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"website_ids\"').selectedIndex",
+ "required_contents": {
+ "exact_match": "0"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"simple_action\"').value",
+ "prep_actions": [
+ "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"
+ ],
+ "required_contents": {
+ "exact_match": "cart_fixed"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"discount_amount\"').value",
+ "prep_actions": [
+ "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"
+ ],
+ "required_contents": {
+ "exact_match": "15"
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 258
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 702,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers",
+ "instantiation_dict": {
+ "topic": "Pride Month",
+ "rule": "45% off on all products"
+ },
+ "intent": "Draft a new marketing price rule for Pride Month that offers 45% off on all products for all customers",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"[name='name'\").value",
+ "required_contents": {
+ "must_include": [
+ "Pride Month"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"website_ids\"').selectedIndex",
+ "required_contents": {
+ "exact_match": "0"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"simple_action\"').value",
+ "prep_actions": [
+ "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"
+ ],
+ "required_contents": {
+ "exact_match": "by_percent"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"discount_amount\"').value",
+ "prep_actions": [
+ "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"
+ ],
+ "required_contents": {
+ "exact_match": "45"
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 258
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 703,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Draft a new marketing price rule for {{topic}} that offers {{rule}} for all customers",
+ "instantiation_dict": {
+ "topic": "Thanks giving sale",
+ "rule": "$40 discount on checkout"
+ },
+ "intent": "Draft a new marketing price rule for Thanks giving sale that offers $40 discount on checkout for all customers",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/sales_rule/promo_quote",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"[name='name'\").value",
+ "required_contents": {
+ "must_include": [
+ "Thanks giving sale"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"website_ids\"').selectedIndex",
+ "required_contents": {
+ "exact_match": "0"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"customer_group_ids\"').selectedIndex",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"simple_action\"').value",
+ "prep_actions": [
+ "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"
+ ],
+ "required_contents": {
+ "exact_match": "cart_fixed"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"discount_amount\"').value",
+ "prep_actions": [
+ "document.querySelector('[data-index=\"actions\"]').querySelector('.admin__collapsible-title').click()"
+ ],
+ "required_contents": {
+ "exact_match": "40"
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 258
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 704,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}",
+ "instantiation_dict": {
+ "report": "sales order report",
+ "time_span": "for last month"
+ },
+ "intent": "Today is 3/15/2023, generate a sales order report for last month",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/sales",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_from\"').value",
+ "required_contents": {
+ "exact_match": "2/1/23"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_to\"').value",
+ "required_contents": {
+ "exact_match": "2/28/23"
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 268
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 705,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}",
+ "instantiation_dict": {
+ "report": "sales order report",
+ "time_span": "over the last 45 days"
+ },
+ "intent": "Today is 3/15/2023, generate a sales order report over the last 45 days",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/sales",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_from\"').value",
+ "required_contents": {
+ "exact_match": "1/29/23"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_to\"').value",
+ "required_contents": {
+ "exact_match": "3/15/23"
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 268
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 706,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}",
+ "instantiation_dict": {
+ "report": "refund report",
+ "time_span": "for Q1"
+ },
+ "intent": "Today is 3/15/2023, generate a refund report for Q1",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/refunded",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_from\"').value",
+ "required_contents": {
+ "exact_match": "1/1/23"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_to\"').value",
+ "required_contents": {
+ "exact_match": "3/31/23"
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 268
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 707,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}",
+ "instantiation_dict": {
+ "report": "sales order report",
+ "time_span": "for last year"
+ },
+ "intent": "Today is 3/15/2023, generate a sales order report for last year",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/sales",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_from\"').value",
+ "required_contents": {
+ "exact_match": "1/1/2022"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_to\"').value",
+ "required_contents": {
+ "exact_match": "12/31/2022"
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 268
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 708,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Today is 3/15/2023, generate a {{report}} {{time_span}}",
+ "instantiation_dict": {
+ "report": "tax report",
+ "time_span": "for this year"
+ },
+ "intent": "Today is 3/15/2023, generate a tax report for this year",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/tax/",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_from\"').value",
+ "required_contents": {
+ "exact_match": "1/1/2023"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_to\"').value",
+ "required_contents": {
+ "exact_match": "12/31/2023"
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 268
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 709,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Create an {{type}} report from {{start_date}} to {{end_date}}",
+ "instantiation_dict": {
+ "type": "orders",
+ "start_date": "beginning of May 2021",
+ "end_date": "end of March 2022"
+ },
+ "intent": "Create an orders report from beginning of May 2021 to end of March 2022",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/sales",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_from\"').value",
+ "required_contents": {
+ "exact_match": "5/1/2021"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_to\"').value",
+ "required_contents": {
+ "exact_match": "3/31/2022"
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 271
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 710,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}",
+ "instantiation_dict": {
+ "type": "shipping",
+ "start_date": "08/05/2022",
+ "end_date": "03/01/2023"
+ },
+ "intent": "Create a shipping report from 08/05/2022 to 03/01/2023",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/shipping",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_from\"').value",
+ "required_contents": {
+ "exact_match": "8/5/22"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_to\"').value",
+ "required_contents": {
+ "exact_match": "3/1/23"
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 271
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 711,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}",
+ "instantiation_dict": {
+ "type": "product view",
+ "start_date": "07/05/2021",
+ "end_date": "05/31/2023"
+ },
+ "intent": "Create a product view report from 07/05/2021 to 05/31/2023",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/reports/report_product/viewed/",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_from\"').value",
+ "required_contents": {
+ "exact_match": "7/5/21"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_to\"').value",
+ "required_contents": {
+ "exact_match": "5/31/23"
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 271
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 712,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}",
+ "instantiation_dict": {
+ "type": "coupons",
+ "start_date": "05/01/2021",
+ "end_date": "05/15/2023"
+ },
+ "intent": "Create a coupons report from 05/01/2021 to 05/15/2023",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/coupons/",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_from\"').value",
+ "required_contents": {
+ "exact_match": "5/1/21"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_to\"').value",
+ "required_contents": {
+ "exact_match": "5/15/23"
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 271
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 713,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Create a {{type}} report from {{start_date}} to {{end_date}}",
+ "instantiation_dict": {
+ "type": "best sellers",
+ "start_date": "05/01/2022",
+ "end_date": "05/31/2023"
+ },
+ "intent": "Create a best sellers report from 05/01/2022 to 05/31/2023",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__SHOPPING_ADMIN__/reports/report_sales/bestsellers/",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_from\"').value",
+ "required_contents": {
+ "exact_match": "5/1/22"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[id=\"sales_report_to\"').value",
+ "required_contents": {
+ "exact_match": "5/31/23"
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 271
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 714,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.",
+ "instantiation_dict": {
+ "subreddit": "gadgets",
+ "k": "1"
+ },
+ "intent": "Thumbs down the top 1 post ever in gadgets.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/gadgets/19459/a-custom-gaming-pc-built-inside-a-vintage-1940s-motorola",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 24
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 715,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.",
+ "instantiation_dict": {
+ "subreddit": "history",
+ "k": "2"
+ },
+ "intent": "Thumbs down the top 2 post ever in history.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/history/84338/the-scientist-who-discovered-sperm-was-so-grossed-out-he",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/history/105990/4-500-year-old-sumerian-temple-dedicated-to-mighty-thunder",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 24
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 716,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.",
+ "instantiation_dict": {
+ "k": "3",
+ "subreddit": "books"
+ },
+ "intent": "Thumbs down the top 3 post ever in books.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/books/81371/the-letters-of-t-s-eliot-to-emily-hale-that-were-kept-sealed",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/books/59421/friendly-reminder-bookshop-org-exists",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/books/59447/appalachian-prison-book-project-seeks-notebook-donations-the",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 24
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 717,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.",
+ "instantiation_dict": {
+ "k": "4",
+ "subreddit": "movies"
+ },
+ "intent": "Thumbs down the top 4 post ever in movies.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/movies/86174/who-will-win-the-oscar-for-actress-in-a-supporting-role",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/movies/86029/who-will-win-the-oscar-for-film-editing",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/movies/86055/cindy-williams-dies-laverne-amp-shirley-star-who-appeared-in",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/movies/42682/michelle-yeoh-to-receive-palm-springs-film-festival-s",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 24
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 718,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Thumbs down the top {{k}} post ever in {{subreddit}}.",
+ "instantiation_dict": {
+ "k": "5",
+ "subreddit": "technology"
+ },
+ "intent": "Thumbs down the top 5 post ever in technology.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/technology/48670/brain-cancer-vaccine-succeeds-at-prolonging-survival-in",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/technology/134696/india-cuts-internet-for-27-million-people-amid-search-for",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/technology/48785/us-judge-orders-amazon-to-cease-and-desist-anti-union",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/technology/70354/activision-s-boston-studio-workers-announce-unionization",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/technology/70233/social-media-influencers-are-charged-with-feeding-followers",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 24
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 719,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}",
+ "instantiation_dict": {
+ "subreddit": "wallstreetbets",
+ "user": "ThetaGang_wsb"
+ },
+ "intent": "Like all submissions created by ThetaGang_wsb in subreddit wallstreetbets",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/wallstreetbets/29478/how-will-airbnb-close-following-their-earnings-report-on",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/wallstreetbets/29458/how-much-will-the-federal-reserve-raise-interest-rates-in",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 25
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 720,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}",
+ "instantiation_dict": {
+ "subreddit": "earthporn",
+ "user": "CameronKelsey"
+ },
+ "intent": "Like all submissions created by CameronKelsey in subreddit earthporn",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/EarthPorn/98332/my-favorite-place-on-the-planet-henry-s-fork-of-the-snake",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/EarthPorn/98297/2-years-later-this-is-still-one-of-the-most-incredible",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/EarthPorn/98256/i-can-t-wait-for-all-this-green-to-start-coming-back-little",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 25
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 721,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}",
+ "instantiation_dict": {
+ "user": "UniversityofBath",
+ "subreddit": "IAmA"
+ },
+ "intent": "Like all submissions created by UniversityofBath in subreddit IAmA",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/IAmA/119742/hi-i-m-vienne-a-doctoral-student-at-the-university-of-bath-i",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/IAmA/119719/hello-reddit-i-m-nazia-mehrban-a-lecturer-in-biotechnology",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/IAmA/119714/i-m-ellie-jarvis-she-her-a-2nd-year-phd-student-in-the",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/IAmA/55155/hi-i-m-dr-lucy-maddox-from-bath-university-uk-i-m-a-clinical",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/IAmA/55142/we-re-sadeka-nujhat-hannah-leese-and-sandhya-moise-from-the",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/IAmA/34032/we-re-sandhya-moise-david-phillips-and-chan-lee-from-the",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/IAmA/13175/hi-i-m-kit-yates-i-m-a-mathematical-biologist-at-the",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/IAmA/13170/hello-i-m-dr-sara-fontani-from-the-university-of",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 25
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 722,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}",
+ "instantiation_dict": {
+ "user": "Don_Gato1",
+ "subreddit": "new york"
+ },
+ "intent": "Like all submissions created by Don_Gato1 in subreddit new york",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/nyc/44650/fox-news-hosts-cast-new-york-as-crime-ridden-and-chaotic",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 25
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 723,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}",
+ "instantiation_dict": {
+ "user": "FTorrez81",
+ "subreddit": "iphone13"
+ },
+ "intent": "Like all submissions created by FTorrez81 in subreddit iphone13",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "FTorrez81 does not have any submissions in iphone13"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "reference_answer_raw_annotation": "N/A",
+ "string_note": ""
+ },
+ "intent_template_id": 25,
+ "string_note": "FTorrez81 has no submissions in subreddit iphone13"
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 724,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Like all submissions created by {{user}} in subreddit {{subreddit}}",
+ "instantiation_dict": {
+ "user": "Hrekires",
+ "subreddit": "news"
+ },
+ "intent": "Like all submissions created by Hrekires in subreddit news",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/news/129816/gov-whitmer-signs-bills-to-repeal-right-to-work-restore",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/news/129808/disney-world-deal-with-union-will-raise-minimum-wage-to-18",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/news/129794/judge-halts-wyoming-abortion-ban-days-after-it-took-effect",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/news/129783/don-t-say-gay-lawmaker-pleads-guilty-to-covid-relief-fraud",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/news/129594/arizona-gov-katie-hobbs-refuses-to-proceed-with-execution",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/news/129508/tennessee-governor-oks-bill-to-cut-nashville-council-in-half",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/news/43839/philadelphia-da-larry-krasner-impeached-by-pa-house",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/news/43781/crypto-giant-ftx-to-file-for-bankruptcy-ceo-sam-bankman",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/news/43572/sec-doj-investigating-crypto-platform-ftx",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/news/43558/kansas-gov-laura-kelly-wins-re-election-defeating-gop",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-upvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 25
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 725,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}",
+ "instantiation_dict": {
+ "subreddit": "massachusetts",
+ "user": "RickyDontLoseThat"
+ },
+ "intent": "DisLike all submissions created by RickyDontLoseThat in subreddit massachusetts",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/massachusetts/84954/the-last-of-lincoln",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 1510
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 726,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}",
+ "instantiation_dict": {
+ "subreddit": "earthporn",
+ "user": "jacyanthis"
+ },
+ "intent": "DisLike all submissions created by jacyanthis in subreddit earthporn",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "jacyanthis does not have any submissions in earthporn"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 1510
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 727,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}",
+ "instantiation_dict": {
+ "user": "PatientBuilder499",
+ "subreddit": "videos"
+ },
+ "intent": "DisLike all submissions created by PatientBuilder499 in subreddit videos",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/videos/115139/hundreds-of-civilian-turkish-volunteers-waiting-to-be-sent",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 1510
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 728,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}",
+ "instantiation_dict": {
+ "user": "sirbarani",
+ "subreddit": "sports"
+ },
+ "intent": "DisLike all submissions created by sirbarani in subreddit sports",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/sports/48303/iran-football-legend-daei-will-not-attend-world-cup-amid",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 1510
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 729,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}",
+ "instantiation_dict": {
+ "user": "AdamCannon",
+ "subreddit": "UpliftingNews"
+ },
+ "intent": "DisLike all submissions created by AdamCannon in subreddit UpliftingNews",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/UpliftingNews/16087/same-sex-marriage-is-now-legal-in-all-of-mexico-s-states",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 1510
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 730,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "DisLike all submissions created by {{user}} in subreddit {{subreddit}}",
+ "instantiation_dict": {
+ "user": "Hrekires",
+ "subreddit": "news"
+ },
+ "intent": "DisLike all submissions created by Hrekires in subreddit news",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/news/129816/gov-whitmer-signs-bills-to-repeal-right-to-work-restore",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/news/129808/disney-world-deal-with-union-will-raise-minimum-wage-to-18",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/news/129794/judge-halts-wyoming-abortion-ban-days-after-it-took-effect",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/news/129783/don-t-say-gay-lawmaker-pleads-guilty-to-covid-relief-fraud",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/news/129594/arizona-gov-katie-hobbs-refuses-to-proceed-with-execution",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/news/129508/tennessee-governor-oks-bill-to-cut-nashville-council-in-half",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/news/43839/philadelphia-da-larry-krasner-impeached-by-pa-house",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/news/43781/crypto-giant-ftx-to-file-for-bankruptcy-ceo-sam-bankman",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/news/43572/sec-doj-investigating-crypto-platform-ftx",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ },
+ {
+ "url": "__REDDIT__/f/news/43558/kansas-gov-laura-kelly-wins-re-election-defeating-gop",
+ "locator": "document.querySelector('div.submission__vote').querySelector('form').getAttribute('class')",
+ "required_contents": {
+ "must_include": [
+ "vote vote--user-downvoted"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 1510
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 731,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"",
+ "instantiation_dict": {
+ "post": "Nvidia RTX 4090",
+ "content": "EDIT: This news aged well"
+ },
+ "intent": "Edit my post on Nvidia RTX 4090 by adding a line to the body that says \"EDIT: This news aged well\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/MachineLearning/1/nvidia-rtx-4090",
+ "locator": "document.querySelector('.submission__body').outerText",
+ "required_contents": {
+ "must_include": [
+ "EDIT: This news aged well",
+ "Crazy device for ML!"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 27
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 732,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"",
+ "instantiation_dict": {
+ "post": "The Night Agent",
+ "content": "Done watching, pretty cool!"
+ },
+ "intent": "Edit my post on The Night Agent by adding a line to the body that says \"Done watching, pretty cool!\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/television/134868/the-night-agent-renewed-for-season-2-at-netflix",
+ "locator": "document.querySelector('.submission__body').outerText",
+ "required_contents": {
+ "exact_match": "Done watching, pretty cool!"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 27
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 733,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"",
+ "instantiation_dict": {
+ "post": "Star Trek Starfleet Academy series",
+ "content": "Every watch makes me feel like a kid again"
+ },
+ "intent": "Edit my post on Star Trek Starfleet Academy series by adding a line to the body that says \"Every watch makes me feel like a kid again\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/television/135201/star-trek-starfleet-academy-series-from-alex-kurtzman-and",
+ "locator": "document.querySelector('.submission__body').outerText",
+ "required_contents": {
+ "exact_match": "Every watch makes me feel like a kid again"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 27
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 734,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"",
+ "instantiation_dict": {
+ "post": "Ted Lasso",
+ "content": "Done watching. I love the renew!"
+ },
+ "intent": "Edit my post on Ted Lasso by adding a line to the body that says \"Done watching. I love the renew!\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/television/135156/ted-lasso-season-3-premiere-scores-870k-u-s-households-up-59",
+ "locator": "document.querySelector('.submission__body').outerText",
+ "required_contents": {
+ "exact_match": "Done watching. I love the renew!"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 27
+ },
+ {
+ "sites": [
+ "reddit"
+ ],
+ "task_id": 735,
+ "require_login": true,
+ "storage_state": "./.auth/reddit_state.json",
+ "start_url": "__REDDIT__",
+ "geolocation": null,
+ "intent_template": "Edit my post on {{post}} by adding a line to the body that says \"{{content}}\"",
+ "instantiation_dict": {
+ "post": "Lord of the Rings",
+ "content": "The cast is amazing!"
+ },
+ "intent": "Edit my post on Lord of the Rings by adding a line to the body that says \"The cast is amazing!\"",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__REDDIT__/f/television/135152/lord-of-the-rings-the-rings-of-power-season-2-cast-adds",
+ "locator": "document.querySelector('.submission__body').outerText",
+ "required_contents": {
+ "exact_match": "The cast is amazing!"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 27
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 736,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Make the LICENSE of {{repo}} to MIT license.",
+ "instantiation_dict": {
+ "repo": "gimmiethat.space and dotfiles"
+ },
+ "intent": "Make the LICENSE of gimmiethat.space and dotfiles to MIT license.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/gimmiethat.space/-/blob/main/LICENSE",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "MIT license",
+ "The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software."
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/dotfiles/-/blob/main/LICENSE",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "MIT license",
+ "The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software."
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 355
+ },
+ {
+ "sites": [
+ "wikipedia",
+ "map"
+ ],
+ "task_id": 737,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}",
+ "instantiation_dict": {
+ "location": "Carnegie Mellon University",
+ "sport_team": "Philadelphia 76ers",
+ "time": ""
+ },
+ "intent": "Show me the way from Carnegie Mellon University to the home stadium of Philadelphia 76ers ",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_from\"]').value",
+ "required_contents": {
+ "must_include": [
+ "Carnegie Mellon University",
+ "Pittsburgh"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_to\"]').value",
+ "required_contents": {
+ "must_include": [
+ "Wells Fargo Center",
+ "South Philadelphia Sports Complex"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 94
+ },
+ {
+ "sites": [
+ "wikipedia",
+ "map"
+ ],
+ "task_id": 738,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}",
+ "instantiation_dict": {
+ "location": "Carnegie Mellon University",
+ "sport_team": "Philadelphia 76ers",
+ "time": "in the 70th"
+ },
+ "intent": "Show me the way from Carnegie Mellon University to the home stadium of Philadelphia 76ers in the 70th",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_from\"]').value",
+ "required_contents": {
+ "must_include": [
+ "Carnegie Mellon University",
+ "Pittsburgh"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_to\"]').value",
+ "required_contents": {
+ "must_include": [
+ "3601 South Broad Street",
+ "South Philadelphia"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 94
+ },
+ {
+ "sites": [
+ "wikipedia",
+ "map"
+ ],
+ "task_id": 739,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}",
+ "instantiation_dict": {
+ "location": "Carnegie Mellon University",
+ "sport_team": "Yankees",
+ "time": "in the 80th"
+ },
+ "intent": "Show me the way from Carnegie Mellon University to the home stadium of Yankees in the 80th",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_from\"]').value",
+ "required_contents": {
+ "must_include": [
+ "Carnegie Mellon University",
+ "Pittsburgh"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_to\"]').value",
+ "required_contents": {
+ "must_include": [
+ "Yankee Stadium",
+ "East 161st Street"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 94
+ },
+ {
+ "sites": [
+ "wikipedia",
+ "map"
+ ],
+ "task_id": 740,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}",
+ "instantiation_dict": {
+ "location": "Carnegie Mellon University",
+ "sport_team": "NYC NBA team",
+ "time": ""
+ },
+ "intent": "Show me the way from Carnegie Mellon University to the home stadium of NYC NBA team ",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_from\"').value",
+ "required_contents": {
+ "must_include": [
+ "Carnegie Mellon University",
+ "Pittsburgh"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_to\"').value",
+ "required_contents": {
+ "must_include": [
+ "Madison Square Garden",
+ "Pennsylvania Plaza",
+ "Manhattan",
+ "New York"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 94
+ },
+ {
+ "sites": [
+ "wikipedia",
+ "map"
+ ],
+ "task_id": 741,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Show me the way from {{location}} to the home stadium of {{sport_team}} {{time}}",
+ "instantiation_dict": {
+ "location": "Carnegie Mellon University",
+ "sport_team": "Boston home NBA team",
+ "time": ""
+ },
+ "intent": "Show me the way from Carnegie Mellon University to the home stadium of Boston home NBA team ",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_from\"').value",
+ "required_contents": {
+ "must_include": [
+ "Carnegie Mellon University",
+ "Pittsburgh"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_to\"').value",
+ "required_contents": {
+ "must_include": [
+ "TD Garden",
+ "Boston",
+ "Massachusetts"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 94
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 742,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a new {{scope}} project \"planner\" and add {{account_list}} as members",
+ "instantiation_dict": {
+ "scope": "private",
+ "account_list": "Abishek, Vinta"
+ },
+ "intent": "Create a new private project \"planner\" and add Abishek, Vinta as members",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/planner",
+ "locator": "document.querySelector('.visibility-icon').getAttribute('title')",
+ "required_contents": {
+ "must_include": [
+ "private"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/planner/-/project_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "@abisubramanya27",
+ "@vinta"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 332
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 743,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a new {{scope}} project \"web_arena\" and add {{account_list}} as members",
+ "instantiation_dict": {
+ "scope": "public",
+ "account_list": "Abishek, Vinta"
+ },
+ "intent": "Create a new public project \"web_arena\" and add Abishek, Vinta as members",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/web_arena",
+ "locator": "document.querySelector('.visibility-icon').getAttribute('title')",
+ "required_contents": {
+ "must_include": [
+ "public"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/web_arena/-/project_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "@abisubramanya27",
+ "@vinta"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 332
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 744,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a new {{scope}} project \"AutoAGI\" and add {{account_list}} as members",
+ "instantiation_dict": {
+ "scope": "public",
+ "account_list": "primer"
+ },
+ "intent": "Create a new public project \"AutoAGI\" and add primer as members",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/AutoAGI",
+ "locator": "document.querySelector('.visibility-icon').getAttribute('title')",
+ "required_contents": {
+ "must_include": [
+ "public"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/AutoAGI/-/project_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "@primer"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 332
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 745,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a new {{scope}} project \"awesome-llms\" and add {{account_list}} as members",
+ "instantiation_dict": {
+ "scope": "public",
+ "account_list": "primer, convexegg, abishek"
+ },
+ "intent": "Create a new public project \"awesome-llms\" and add primer, convexegg, abishek as members",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/awesome-llms",
+ "locator": "document.querySelector('.visibility-icon').getAttribute('title')",
+ "required_contents": {
+ "must_include": [
+ "public"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/awesome-llms/-/project_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "@primer",
+ "@convexegg",
+ "@abisubramanya27"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 332
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 746,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a new {{scope}} project \"llm_bulk_inference\" and add {{account_list}} as members",
+ "instantiation_dict": {
+ "scope": "private",
+ "account_list": "primer, convexegg, abishek"
+ },
+ "intent": "Create a new private project \"llm_bulk_inference\" and add primer, convexegg, abishek as members",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/llm_bulk_inference",
+ "locator": "document.querySelector('.visibility-icon').getAttribute('title')",
+ "required_contents": {
+ "must_include": [
+ "Private"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/llm_bulk_inference/-/project_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "@primer",
+ "@convexegg",
+ "@abisubramanya27"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 332
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 747,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members",
+ "instantiation_dict": {
+ "project_name": "awesome_web_agents",
+ "template": "blank",
+ "account_list": "Abishek, Vinta"
+ },
+ "intent": "Start a private project awesome_web_agents with blank template and add Abishek, Vinta as members",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/awesome_web_agents",
+ "locator": "document.querySelector('.visibility-icon').getAttribute('title')",
+ "required_contents": {
+ "must_include": [
+ "Private"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/awesome_web_agents/-/commits",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Initial commit"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/awesome_web_agents/-/project_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "@abisubramanya27",
+ "@vinta"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 2100
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 748,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members",
+ "instantiation_dict": {
+ "project_name": "web_agent_android_xl",
+ "template": "Android",
+ "account_list": "primer, convexegg, abishek"
+ },
+ "intent": "Start a private project web_agent_android_xl with Android template and add primer, convexegg, abishek as members",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/web_agent_android_xl",
+ "locator": "document.querySelector('.visibility-icon').getAttribute('title')",
+ "required_contents": {
+ "must_include": [
+ "Private"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/web_agent_android_xl/-/commits",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Initialized from 'Android' project template"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/web_agent_android_xl/-/project_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "@primer",
+ "@convexegg",
+ "@abisubramanya27"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 2100
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 749,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members",
+ "instantiation_dict": {
+ "project_name": "project_site",
+ "template": "NodeJS",
+ "account_list": "primer, convexegg, vinta"
+ },
+ "intent": "Start a private project project_site with NodeJS template and add primer, convexegg, vinta as members",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/project_site",
+ "locator": "document.querySelector('.visibility-icon').getAttribute('title')",
+ "required_contents": {
+ "must_include": [
+ "Private"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/project_site/-/commits",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Initialized from 'NodeJS Express' project template"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/project_site/-/project_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "@primer",
+ "@convexegg",
+ "@vinta"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 2100
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 750,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members",
+ "instantiation_dict": {
+ "project_name": "agi_index",
+ "template": "HTML",
+ "account_list": "Vinta Chen"
+ },
+ "intent": "Start a private project agi_index with HTML template and add Vinta Chen as members",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/agi_index",
+ "locator": "document.querySelector('.visibility-icon').getAttribute('title')",
+ "required_contents": {
+ "must_include": [
+ "Private"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/agi_index",
+ "locator": "document.querySelector('.home-panel-description-markdown').outerText",
+ "required_contents": {
+ "must_include": [
+ "Example plain HTML site using GitLab Pages: https://pages.gitlab.io/plain-html |OR| A plain HTML site that uses Netlify for CI/CD instead of GitLab, but still with all the other great GitLab features."
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/agi_index/-/project_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Vinta Chen"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 2100
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 751,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Start a private project {{project_name}} with {{template}} template and add {{account_list}} as members",
+ "instantiation_dict": {
+ "project_name": "AGISite",
+ "template": "JEKYLL",
+ "account_list": "Rohan and Vinta"
+ },
+ "intent": "Start a private project AGISite with JEKYLL template and add Rohan and Vinta as members",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/AGISite",
+ "locator": "document.querySelector('.visibility-icon').getAttribute('title')",
+ "required_contents": {
+ "must_include": [
+ "Private"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/AGISite",
+ "locator": "document.querySelector('.home-panel-description-markdown').outerText",
+ "required_contents": {
+ "must_include": [
+ "Example Jekyll site using GitLab Pages: https://pages.gitlab.io/jekyll |OR| A Jekyll site that uses Netlify for CI/CD instead of GitLab, but still with all the other great GitLab features."
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/AGISite/-/project_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "@Seirdy",
+ "@vinta"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 2100
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 752,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.",
+ "instantiation_dict": {
+ "project_name": "web_agent",
+ "template": "blank"
+ },
+ "intent": "Create a private blank repository called \"web_agent\" using the right template to speed up development.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/web_agent",
+ "locator": "document.querySelector('.visibility-icon').getAttribute('title')",
+ "required_contents": {
+ "must_include": [
+ "Private"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/web_agent/-/commits",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Initial commit"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 332
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 753,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.",
+ "instantiation_dict": {
+ "project_name": "web_agent_android_xs",
+ "template": "Android"
+ },
+ "intent": "Create a private Android repository called \"web_agent_android_xs\" using the right template to speed up development.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/web_agent_android_xs",
+ "locator": "document.querySelector('.visibility-icon').getAttribute('title')",
+ "required_contents": {
+ "must_include": [
+ "Private"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/web_agent_android_xs/-/commits",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Initialized from 'Android' project template"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 332
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 754,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.",
+ "instantiation_dict": {
+ "project_name": "web_agent_nodejs",
+ "template": "NodeJS"
+ },
+ "intent": "Create a private NodeJS repository called \"web_agent_nodejs\" using the right template to speed up development.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/web_agent_nodejs",
+ "locator": "document.querySelector('.visibility-icon').getAttribute('title')",
+ "required_contents": {
+ "must_include": [
+ "Private"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/web_agent_nodejs/-/commits",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Initialized from 'NodeJS Express' project template"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 332
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 755,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.",
+ "instantiation_dict": {
+ "project_name": "web_agent_index",
+ "template": "HTML"
+ },
+ "intent": "Create a private HTML repository called \"web_agent_index\" using the right template to speed up development.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/web_agent_index",
+ "locator": "document.querySelector('.visibility-icon').getAttribute('title')",
+ "required_contents": {
+ "must_include": [
+ "Private"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/web_agent_index",
+ "locator": "document.querySelector('.home-panel-description-markdown').outerText",
+ "required_contents": {
+ "must_include": [
+ "Example plain HTML site using GitLab Pages: https://pages.gitlab.io/plain-html |OR| A plain HTML site that uses Netlify for CI/CD instead of GitLab, but still with all the other great GitLab features."
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 332
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 756,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create a private {{template}} repository called \"{{project_name}}\" using the right template to speed up development.",
+ "instantiation_dict": {
+ "project_name": "11711_gitlab",
+ "template": "JEKYLL"
+ },
+ "intent": "Create a private JEKYLL repository called \"11711_gitlab\" using the right template to speed up development.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/byteblaze/11711_gitlab",
+ "locator": "document.querySelector('.visibility-icon').getAttribute('title')",
+ "required_contents": {
+ "must_include": [
+ "Private"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/byteblaze/11711_gitlab",
+ "locator": "document.querySelector('.home-panel-description-markdown').outerText",
+ "required_contents": {
+ "must_include": [
+ "Example Jekyll site using GitLab Pages: https://pages.gitlab.io/jekyll |OR| A Jekyll site that uses Netlify for CI/CD instead of GitLab, but still with all the other great GitLab features."
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 332
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 757,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Show me the path and travel time from {{city1}} to {{city2}}.",
+ "instantiation_dict": {
+ "city1": "home of the 1980 Super Bowl champions",
+ "city2": "home of the 1991 Super Bowl champions"
+ },
+ "intent": "Show me the path and travel time from home of the 1980 Super Bowl champions to home of the 1991 Super Bowl champions.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_from\"').value",
+ "required_contents": {
+ "must_include": [
+ "Pittsburgh"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_to\"').value",
+ "required_contents": {
+ "must_include": [
+ "New York"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 42
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 758,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Show me the path and travel time from {{city1}} to {{city2}}.",
+ "instantiation_dict": {
+ "city1": "the big apple",
+ "city2": "biggest city in Maine"
+ },
+ "intent": "Show me the path and travel time from the big apple to biggest city in Maine.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_from\"').value",
+ "required_contents": {
+ "must_include": [
+ "New York"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_to\"').value",
+ "required_contents": {
+ "must_include": [
+ "Portland",
+ "Maine"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 42
+ },
+ {
+ "sites": [
+ "map",
+ "shopping_admin"
+ ],
+ "task_id": 759,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Show me the route and driving time from {{city1}} to {{city2}}",
+ "instantiation_dict": {
+ "city1": "the city where my E-commerce customer Sophia Young lives",
+ "city2": "New York City"
+ },
+ "intent": "Show me the route and driving time from the city where my E-commerce customer Sophia Young lives to New York City",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_from\"').value",
+ "required_contents": {
+ "must_include": [
+ "Boston"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_to\"').value",
+ "required_contents": {
+ "must_include": [
+ "New York"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 42
+ },
+ {
+ "sites": [
+ "map",
+ "shopping_admin"
+ ],
+ "task_id": 760,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Show me the route and driving time from {{city1}} to {{city2}}",
+ "instantiation_dict": {
+ "city1": "Allentown, PA",
+ "city2": "the city where my E-commerce customer Amanda Kim lives"
+ },
+ "intent": "Show me the route and driving time from Allentown, PA to the city where my E-commerce customer Amanda Kim lives",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_from\"').value",
+ "required_contents": {
+ "must_include": [
+ "Allentown"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_to\"').value",
+ "required_contents": {
+ "must_include": [
+ "Hoboken",
+ "New Jersey"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 42
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 761,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Get directions from {{location/address_1}} to {{location/address_2}} using {{transportation}} options.",
+ "instantiation_dict": {
+ "location/address_1": "Carnegie Science Museum",
+ "location/address_2": "Hunt library CMU",
+ "transportation": "walk"
+ },
+ "intent": "Get directions from Carnegie Science Museum to Hunt library CMU using walk options.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex",
+ "required_contents": {
+ "exact_match": "2"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_from\"').value",
+ "required_contents": {
+ "must_include": [
+ "Carnegie Science Center",
+ "Allegheny County",
+ "Pittsburgh"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_to\"').value",
+ "required_contents": {
+ "must_include": [
+ "Hunt Library",
+ "Pittsburgh"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 54
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 762,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Get directions from {{location/address_1}} to {{location/address_2}} using {{transportation}} options.",
+ "instantiation_dict": {
+ "location/address_1": "Carnegie Music Hall in NYC",
+ "location/address_2": "Carnegie Mellon University",
+ "transportation": "driving"
+ },
+ "intent": "Get directions from Carnegie Music Hall in NYC to Carnegie Mellon University using driving options.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_from\"').value",
+ "required_contents": {
+ "must_include": [
+ "Carnegie Hall",
+ "West 57th Street",
+ "Manhattan",
+ "New York"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_to\"').value",
+ "required_contents": {
+ "must_include": [
+ "Carnegie Mellon University",
+ "Pittsburgh"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 54
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 763,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Find the walkway to the closest {{store}} from {{location}}.",
+ "instantiation_dict": {
+ "store": "Trader Joe's",
+ "location": "401 Shady Ave, Pittsburgh"
+ },
+ "intent": "Find the walkway to the closest Trader Joe's from 401 Shady Ave, Pittsburgh.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex",
+ "required_contents": {
+ "exact_match": "2"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_from\"').value",
+ "required_contents": {
+ "must_include": [
+ "401, Shady Avenue, Shadyside"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_to\"').value",
+ "required_contents": {
+ "must_include": [
+ "Trader Joe's, 6343, Penn Avenue, East Liberty"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 75
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 764,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Find the walkway to the closest {{store}} from {{location}}.",
+ "instantiation_dict": {
+ "store": "Target",
+ "location": "401 Shady Ave, Pittsburgh"
+ },
+ "intent": "Find the walkway to the closest Target from 401 Shady Ave, Pittsburgh.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex",
+ "required_contents": {
+ "exact_match": "2"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_from\"').value",
+ "required_contents": {
+ "must_include": [
+ "401, Shady Avenue, Shadyside"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_to\"').value",
+ "required_contents": {
+ "must_include": [
+ "Target, 6231, Penn Avenue, East Liberty"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 75
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 765,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Find the walkway to the closest {{store}} from {{location}}.",
+ "instantiation_dict": {
+ "store": "Japanese food market",
+ "location": "401 Shady Ave, Pittsburgh"
+ },
+ "intent": "Find the walkway to the closest Japanese food market from 401 Shady Ave, Pittsburgh.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex",
+ "required_contents": {
+ "exact_match": "2"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_from\"').value",
+ "required_contents": {
+ "must_include": [
+ "401, Shady Avenue, Shadyside"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_to\"').value",
+ "required_contents": {
+ "must_include": [
+ "Tokyo Japanese Food Store, 5855, Ellsworth Avenue, Shadyside"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 75
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 766,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Find the walkway to the closest {{store}} from {{location}}.",
+ "instantiation_dict": {
+ "store": "grocessory owned by Amazon",
+ "location": "401 Shady Ave, Pittsburgh"
+ },
+ "intent": "Find the walkway to the closest grocessory owned by Amazon from 401 Shady Ave, Pittsburgh.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex",
+ "required_contents": {
+ "exact_match": "2"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_from\"').value",
+ "required_contents": {
+ "must_include": [
+ "401, Shady Avenue, Shadyside"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_to\"').value",
+ "required_contents": {
+ "must_include": [
+ "Whole Foods Market, 5700, Penn Avenue, East Liberty"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 75
+ },
+ {
+ "sites": [
+ "map"
+ ],
+ "task_id": 767,
+ "require_login": true,
+ "storage_state": null,
+ "start_url": "__MAP__",
+ "geolocation": null,
+ "intent_template": "Find the walkway to the closest {{store}} from {{location}}.",
+ "instantiation_dict": {
+ "store": "chain grocery store owned by a local business",
+ "location": "401 Shady Ave, Pittsburgh"
+ },
+ "intent": "Find the walkway to the closest chain grocery store owned by a local business from 401 Shady Ave, Pittsburgh.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector(\"div#content select.routing_engines\").selectedIndex",
+ "required_contents": {
+ "exact_match": "2"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_from\"').value",
+ "required_contents": {
+ "must_include": [
+ "401, Shady Avenue, Shadyside"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[name=\"route_to\"').value",
+ "required_contents": {
+ "must_include": [
+ "Giant Eagle, 5550, Centre Avenue, Shadyside"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 75
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 768,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "{{quantity}} {{product}} arrived, update the stock",
+ "instantiation_dict": {
+ "quantity": "5",
+ "product": "blue Cronus yoga pants with size 33"
+ },
+ "intent": "5 blue Cronus yoga pants with size 33 arrived, update the stock",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/872/",
+ "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value",
+ "required_contents": {
+ "exact_match": "5"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/872/",
+ "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][is_in_stock]\"').value",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 241
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 769,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "We've received {{quantity}} {{product}}, please update the inventory.",
+ "instantiation_dict": {
+ "quantity": "378",
+ "product": "brown Aero daily fitness tee in every size"
+ },
+ "intent": "We've received 378 brown Aero daily fitness tee in every size, please update the inventory.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/544/",
+ "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value",
+ "required_contents": {
+ "exact_match": "478"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/547/",
+ "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value",
+ "required_contents": {
+ "exact_match": "478"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/550/",
+ "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value",
+ "required_contents": {
+ "exact_match": "478"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/553/",
+ "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value",
+ "required_contents": {
+ "exact_match": "478"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/556/",
+ "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value",
+ "required_contents": {
+ "exact_match": "478"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 241
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 770,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "We've received {{quantity}}, update the inventory.",
+ "instantiation_dict": {
+ "quantity": "12 white Cora parachute pant of size 28 and 56 blue of size 29"
+ },
+ "intent": "We've received 12 white Cora parachute pant of size 28 and 56 blue of size 29, update the inventory.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1836/",
+ "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value",
+ "required_contents": {
+ "exact_match": "112"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1838/",
+ "locator": "document.querySelector('[name=\"product[quantity_and_stock_status][qty]\"').value",
+ "required_contents": {
+ "exact_match": "156"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 241
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 771,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Approve the positive reviews to display in our store.",
+ "instantiation_dict": {},
+ "intent": "Approve the positive reviews to display in our store.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/review/product/edit/id/352",
+ "locator": "document.querySelector('[name=\"status_id\"').value",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/review/product/edit/id/349",
+ "locator": "document.querySelector('[name=\"status_id\"').value",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/review/product/edit/id/347",
+ "locator": "document.querySelector('[name=\"status_id\"').value",
+ "required_contents": {
+ "exact_match": "1"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 243
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 772,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Delete all {{review_type}}",
+ "instantiation_dict": {
+ "review_type": "pending negative reviews for Circe fleece"
+ },
+ "intent": "Delete all pending negative reviews for Circe fleece",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/review/product/edit/id/999",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Rating isn't Available"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 246
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 773,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Delete all {{review_type}}",
+ "instantiation_dict": {
+ "review_type": "pending negative reviews"
+ },
+ "intent": "Delete all pending negative reviews",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/review/product/edit/id/351",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Rating isn't Available"
+ ]
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/review/product/edit/id/353",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Rating isn't Available"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 246
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 774,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Delete all {{review_type}}",
+ "instantiation_dict": {
+ "review_type": "pending reviews with less than 4 stars"
+ },
+ "intent": "Delete all pending reviews with less than 4 stars",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/review/product/edit/id/351",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Rating isn't Available"
+ ]
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/review/product/edit/id/353",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Rating isn't Available"
+ ]
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/review/product/edit/id/349",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Rating isn't Available"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 246
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 775,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Delete all {{review_type}}",
+ "instantiation_dict": {
+ "review_type": "reviews from the scammer Arden"
+ },
+ "intent": "Delete all reviews from the scammer Arden",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/review/product/edit/id/51",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Rating isn't Available"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 246
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 776,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Delete all {{review_type}}",
+ "instantiation_dict": {
+ "review_type": "reviews from the scammer Carlo"
+ },
+ "intent": "Delete all reviews from the scammer Carlo",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/review/product/edit/id/93",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Rating isn't Available"
+ ]
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/review/product/edit/id/109",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Rating isn't Available"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 246
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 777,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "{{action}} the price of {{config}} by {{amount}}",
+ "instantiation_dict": {
+ "amount": "$5",
+ "action": "Reduce",
+ "config": "green Hollister backyard sweater in all sizes"
+ },
+ "intent": "Reduce the price of green Hollister backyard sweater in all sizes by $5",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/120/",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "47.00"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/117/",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "47.00"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/114/",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "47.00"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/111/",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "47.00"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/123/",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "47.00"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 742
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 778,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "{{action}} the price of {{config}} by {{amount}}",
+ "instantiation_dict": {
+ "amount": "13.5%",
+ "action": "Reduce",
+ "config": "size 28 Sahara leggings"
+ },
+ "intent": "Reduce the price of size 28 Sahara leggings by 13.5%",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1841/",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "64.88"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1842/",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "64.88"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1843/",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "64.88"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 742
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 779,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "{{action}} the price of {{config}} by {{amount}}",
+ "instantiation_dict": {
+ "amount": "15%",
+ "action": "Reduce",
+ "config": "yellow shirts from Gwyn Endurance in all sizes below L"
+ },
+ "intent": "Reduce the price of yellow shirts from Gwyn Endurance in all sizes below L by 15%",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1559/",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "20.40"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1562/",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "20.40"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1565/",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "20.40"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 742
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 780,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1481/",
+ "geolocation": null,
+ "intent_template": "{{action}} the price of {{config}} by {{amount}}",
+ "instantiation_dict": {
+ "amount": "$17",
+ "action": "Increase",
+ "config": "white Ingrid Running with size L and above"
+ },
+ "intent": "Increase the price of white Ingrid Running with size L and above by $17",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1264/",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "64.00"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1267/",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "64.00"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 742
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 781,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "{{action}} the price of {{config}} by {{amount}}",
+ "instantiation_dict": {
+ "amount": "37%",
+ "action": "Increase",
+ "config": "black fitness tshirts from Desiree with size XS"
+ },
+ "intent": "Increase the price of black fitness tshirts from Desiree with size XS by 37%",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/1573/",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "32.88"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 742
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 782,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "{{action}} the price of {{config}} by {{amount}}",
+ "instantiation_dict": {
+ "amount": "23%",
+ "action": "Increase",
+ "config": "all blue running tshirts in extra small and small sizes"
+ },
+ "intent": "Increase the price of all blue running tshirts in extra small and small sizes by 23%",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/496/",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "22.33"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/499/",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "22.33"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/479/",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "21.56"
+ }
+ },
+ {
+ "url": "__SHOPPING_ADMIN__/catalog/product/edit/id/482/",
+ "locator": "document.querySelector('[name=\"product[price]\"').value",
+ "required_contents": {
+ "exact_match": "21.56"
+ }
+ }
+ ]
+ },
+ "intent_template_id": 742
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 783,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Add the following users to repo {{repo}} as {{role}}: {{user_list}}",
+ "instantiation_dict": {
+ "repo": "kkroening/ffmpeg-python",
+ "role": "maintainer",
+ "user_list": [
+ "yjlou",
+ "a11yproject"
+ ]
+ },
+ "intent": "Add the following users to repo kkroening/ffmpeg-python as maintainer: ['yjlou', 'a11yproject']",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "The user is not the owner of the repo, so the user does not have permissions to add maintainer"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 351
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 784,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/CellularPrivacy/Android-IMSI-Catcher-Detector",
+ "geolocation": null,
+ "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}",
+ "instantiation_dict": {
+ "branch_name": "main",
+ "attribute": "email address"
+ },
+ "intent": "Tell me the email address of the contributor who has the most commits to branch main",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "secupwn@users.noreply.github.com"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "secupwn@users.noreply.github.com"
+ },
+ "intent_template_id": 316
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 785,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/CellularPrivacy/Android-IMSI-Catcher-Detector",
+ "geolocation": null,
+ "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}",
+ "instantiation_dict": {
+ "branch_name": "gh-page",
+ "attribute": "email address"
+ },
+ "intent": "Tell me the email address of the contributor who has the most commits to branch gh-page",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "secupwn@users.noreply.github.com"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "secupwn@users.noreply.github.com"
+ },
+ "intent_template_id": 316
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 786,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/vinta/awesome-python",
+ "geolocation": null,
+ "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}",
+ "instantiation_dict": {
+ "branch_name": "main",
+ "attribute": "number of commits"
+ },
+ "intent": "Tell me the number of commits of the contributor who has the most commits to branch main",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "412"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "412"
+ },
+ "intent_template_id": 316
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 787,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/dehenne/awesome-visibility/-/graphs/master",
+ "geolocation": null,
+ "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}",
+ "instantiation_dict": {
+ "branch_name": "main",
+ "attribute": "number of followers"
+ },
+ "intent": "Tell me the number of followers of the contributor who has the most commits to branch main",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_exact_match": "0"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "0"
+ },
+ "intent_template_id": 316
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 788,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/amwhalen/archive-my-tweets/-/tree/github/fork/chtitux/addRssFeed",
+ "geolocation": null,
+ "intent_template": "Tell me the {{attribute}} of the contributor who has the most commits to branch {{branch_name}}",
+ "instantiation_dict": {
+ "branch_name": "php52",
+ "attribute": "full name, gitlab account name, location and email address"
+ },
+ "intent": "Tell me the full name, gitlab account name, location and email address of the contributor who has the most commits to branch php52",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "qa": [
+ {
+ "question": "Does the passage clearly state the full name is Andrew M. Whalen?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does the passage clearly state the gitlab account name is amwhalen?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does the passage clearly state the location is Massachusetts?",
+ "answer": "Yes"
+ },
+ {
+ "question": "Does the passage clearly state the email address is github@amwhalen.com?",
+ "answer": "Yes"
+ }
+ ]
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "Andrew M. Whalen, amwhalen, Massachusetts, github@amwhalen.com"
+ },
+ "intent_template_id": 316
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 789,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create an issue {{issue}} in {{repo}}.",
+ "instantiation_dict": {
+ "repo": "huggingface dataset",
+ "issue": "asking about do they have any plan on supporting Webagent in the next quarter"
+ },
+ "intent": "Create an issue asking about do they have any plan on supporting Webagent in the next quarter in huggingface dataset.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "huggingface repo does not exist"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 328
+ },
+ {
+ "sites": [
+ "shopping_admin"
+ ],
+ "task_id": 790,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_admin_state.json",
+ "start_url": "__SHOPPING_ADMIN__",
+ "geolocation": null,
+ "intent_template": "Delete all {{review_type}}",
+ "instantiation_dict": {
+ "review_type": "negative reviews for Sybil running short"
+ },
+ "intent": "Delete all negative reviews for Sybil running short",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "There are no negative reviews for the given product. All reviews are positive."
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 246
+ },
+ {
+ "sites": [
+ "gitlab",
+ "reddit"
+ ],
+ "task_id": 791,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab.reddit_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Make a folder named {{directory}} on the {{gitlab_repo}} repo and include a file called urls.txt that consists of the links to the 5 most recent posts from {{subreddit}}.",
+ "instantiation_dict": {
+ "directory": "car",
+ "subreddit": "cars",
+ "gitlab_repo": "gimmiethat.space"
+ },
+ "intent": "Make a folder named car on the gimmiethat.space repo and include a file called urls.txt that consists of the links to the 5 most recent posts from cars.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "the subreddit does not exist"
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 84
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 794,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Change the delivery address for my most recent order to {{address}}.",
+ "instantiation_dict": {
+ "address": "4000 Forbes Ave, Pittsburgh, PA"
+ },
+ "intent": "Change the delivery address for my most recent order to 4000 Forbes Ave, Pittsburgh, PA.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "The shipping address cannot be changed after an order is placed, according to the website's policy."
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 191
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 795,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Change the delivery address for my most recent order to {{address}}.",
+ "instantiation_dict": {
+ "address": "6726 McPherson Blvd, Pittsburgh, PA"
+ },
+ "intent": "Change the delivery address for my most recent order to 6726 McPherson Blvd, Pittsburgh, PA.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "The shipping address cannot be changed after an order is placed in the website."
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 191
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 796,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Change the delivery address for my most recent order to {{address}}.",
+ "instantiation_dict": {
+ "address": "155 5th Street, San Francisco, CA"
+ },
+ "intent": "Change the delivery address for my most recent order to 155 5th Street, San Francisco, CA.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "The shipping address cannot be changed after an order is placed in the website."
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 191
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 797,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Change the delivery address for my most recent order to {{address}}.",
+ "instantiation_dict": {
+ "address": "3 Oxford St, Cambridge, MA"
+ },
+ "intent": "Change the delivery address for my most recent order to 3 Oxford St, Cambridge, MA.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "The shipping address cannot be changed after an order is placed in the website."
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 191
+ },
+ {
+ "sites": [
+ "shopping"
+ ],
+ "task_id": 798,
+ "require_login": true,
+ "storage_state": "./.auth/shopping_state.json",
+ "start_url": "__SHOPPING__",
+ "geolocation": null,
+ "intent_template": "Change the delivery address for my most recent order to {{address}}.",
+ "instantiation_dict": {
+ "address": "77 Massachusetts Ave, Cambridge, MA"
+ },
+ "intent": "Change the delivery address for my most recent order to 77 Massachusetts Ave, Cambridge, MA.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "string_match"
+ ],
+ "reference_answers": {
+ "fuzzy_na_match": "The shipping address cannot be changed after an order is placed in the website."
+ },
+ "reference_url": "",
+ "program_html": [],
+ "string_note": "",
+ "reference_answer_raw_annotation": "N/A"
+ },
+ "intent_template_id": 191
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 799,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "create a new group \"{{name}}\" with members {{members}}",
+ "instantiation_dict": {
+ "name": "n-lab",
+ "members": "patou, egpast, westurner, jontutcher"
+ },
+ "intent": "create a new group \"n-lab\" with members patou, egpast, westurner, jontutcher",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/groups/n-lab/-/group_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "@patou",
+ "@egpast",
+ "@westurner",
+ "@jontutcher"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 600
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 800,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "create a new group \"{{name}}\" with members {{members}}",
+ "instantiation_dict": {
+ "name": "x-lab",
+ "members": "JonasVautherin, dilipchandima, dawiss1337, bmyun, DCMJY"
+ },
+ "intent": "create a new group \"x-lab\" with members JonasVautherin, dilipchandima, dawiss1337, bmyun, DCMJY",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/groups/x-lab/-/group_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "@JonasVautherin",
+ "@dilipchandima",
+ "@dawiss1337",
+ "@bmyun",
+ "@DCMJY"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 600
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 801,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "create a new group \"{{name}}\" with members {{members}}",
+ "instantiation_dict": {
+ "name": "crew",
+ "members": "ASWATFZLLC, patrickhlauke, westurner, linkmatrix"
+ },
+ "intent": "create a new group \"crew\" with members ASWATFZLLC, patrickhlauke, westurner, linkmatrix",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/groups/crew/-/group_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "@ASWATFZLLC",
+ "@patrickhlauke",
+ "@westurner",
+ "@linkmatrix"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 600
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 802,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "create a new group \"{{name}}\" with members {{members}}",
+ "instantiation_dict": {
+ "name": "coding_friends",
+ "members": "qhduan, Agnes-U"
+ },
+ "intent": "create a new group \"coding_friends\" with members qhduan, Agnes-U",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/groups/coding_friends/-/group_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "@qhduan",
+ "@Agnes-U"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 600
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 803,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "create a new group \"{{name}}\" with members {{members}}",
+ "instantiation_dict": {
+ "name": "webagent",
+ "members": "pandey2000, sayakpaul, sayakpaul"
+ },
+ "intent": "create a new group \"webagent\" with members pandey2000, sayakpaul, sayakpaul",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/groups/webagent/-/group_members",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "@pandey2000",
+ "@sayakpaul"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 600
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 804,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Assign the issue regarding {{issue}} to {{account}}.",
+ "instantiation_dict": {
+ "issue": "flash alerts",
+ "account": "myself and primer"
+ },
+ "intent": "Assign the issue regarding flash alerts to myself and primer.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=byteblaze",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Add documentation on using Flash alerts in dialog components",
+ "Clarify usage of flash alert"
+ ]
+ }
+ },
+ {
+ "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=primer",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Add documentation on using Flash alerts in dialog components",
+ "Clarify usage of flash alert"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 999
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 805,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/a11yproject/a11yproject.com",
+ "geolocation": null,
+ "intent_template": "Submit a merge request for {{source_branch}} to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer",
+ "instantiation_dict": {
+ "source_branch": "the branch that implements the support of template strings",
+ "target_branch": "master",
+ "reviewer": "myself and Roshan"
+ },
+ "intent": "Submit a merge request for the branch that implements the support of template strings to be merged into master branch, assign myself and Roshan as the reviewer",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText",
+ "required_contents": {
+ "exact_match": "feature/replace-gulp"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText",
+ "required_contents": {
+ "exact_match": "main"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.reviewer').outerText",
+ "required_contents": {
+ "must_include": [
+ "Byte Blaze",
+ "Roshan Jossy"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 335
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 806,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer",
+ "instantiation_dict": {
+ "source_branch": "a11yproject.com/redesign",
+ "target_branch": "markdown-figure-block",
+ "reviewer": "myself"
+ },
+ "intent": "Submit a merge request for a11yproject.com/redesign branch to be merged into markdown-figure-block branch, assign myself as the reviewer",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/merge_requests",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText",
+ "required_contents": {
+ "exact_match": "redesign"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText",
+ "required_contents": {
+ "exact_match": "feature/markdown-figure-block"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.reviewer').outerText",
+ "required_contents": {
+ "must_include": [
+ "Byte Blaze"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 335
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 807,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__/primer/design",
+ "geolocation": null,
+ "intent_template": "Submit a merge request for {{source_branch}} branch to be merged into {{target_branch}} branch, assign {{reviewer}} as the reviewer",
+ "instantiation_dict": {
+ "source_branch": "build time debug",
+ "target_branch": "main",
+ "reviewer": "myself"
+ },
+ "intent": "Submit a merge request for build time debug branch to be merged into main branch, assign myself as the reviewer",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/primer/design/-/merge_requests",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[1].outerText",
+ "required_contents": {
+ "exact_match": "main"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelectorAll(\".detail-page-description > a.gl-font-monospace\")[0].outerText",
+ "required_contents": {
+ "exact_match": "debug-build-time"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.reviewer').outerText",
+ "required_contents": {
+ "must_include": [
+ "Byte Blaze"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 335
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 808,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}",
+ "instantiation_dict": {
+ "repo": "cloud-to-butt",
+ "issue": "Let's keep the project alive",
+ "account": "myself",
+ "due": "the end of Q1 2033"
+ },
+ "intent": "Create an issue in cloud-to-butt repo with title \"Let's keep the project alive\". Assign the issue to myself. Set due date to be the end of Q1 2033",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/byteblaze/cloud-to-butt/-/issues",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText",
+ "required_contents": {
+ "exact_match": "Let's keep the project alive"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"').outerText",
+ "required_contents": {
+ "must_include": [
+ "Mar 31, 2033"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.assignee').outerText",
+ "required_contents": {
+ "must_include": [
+ "Byte Blaze"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 327
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 809,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Create an issue in {{repo}} repo with title \"{{issue}}\". Assign the issue to {{account}}. Set due date to be {{due}}",
+ "instantiation_dict": {
+ "repo": "a11yproject",
+ "issue": "404 for many URLs",
+ "account": "myself",
+ "due": "2030-1-3"
+ },
+ "intent": "Create an issue in a11yproject repo with title \"404 for many URLs\". Assign the issue to myself. Set due date to be 2030-1-3",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "url_match",
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "__GITLAB__/a11yproject/a11yproject.com/-/issues",
+ "program_html": [
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-qa-selector=\"title_content\"]').outerText",
+ "required_contents": {
+ "exact_match": "404 for many URLs"
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('[data-testid=\"sidebar-due-date\"').outerText",
+ "required_contents": {
+ "must_include": [
+ "Jan 3, 2030"
+ ]
+ }
+ },
+ {
+ "url": "last",
+ "locator": "document.querySelector('.block.assignee').outerText",
+ "required_contents": {
+ "must_include": [
+ "Byte Blaze"
+ ]
+ }
+ }
+ ],
+ "url_note": "GOLD in PRED"
+ },
+ "intent_template_id": 327
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 810,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Assign the issue regarding {{issue}} in {{repo}} repo to {{account}}.",
+ "instantiation_dict": {
+ "repo": "primer design guide",
+ "issue": "flash alert",
+ "account": "myself"
+ },
+ "intent": "Assign the issue regarding flash alert in primer design guide repo to myself.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=byteblaze",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "Add documentation on using Flash alerts in dialog components",
+ "Clarify usage of flash alert"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 999
+ },
+ {
+ "sites": [
+ "gitlab"
+ ],
+ "task_id": 811,
+ "require_login": true,
+ "storage_state": "./.auth/gitlab_state.json",
+ "start_url": "__GITLAB__",
+ "geolocation": null,
+ "intent_template": "Assign the issue regarding {{issue}} in {{repo}} to {{account}}.",
+ "instantiation_dict": {
+ "repo": "a11yproject",
+ "issue": 404,
+ "account": "myself"
+ },
+ "intent": "Assign the issue regarding 404 in a11yproject to myself.",
+ "require_reset": false,
+ "eval": {
+ "eval_types": [
+ "program_html"
+ ],
+ "reference_answers": null,
+ "reference_url": "",
+ "program_html": [
+ {
+ "url": "__GITLAB__/dashboard/issues?scope=all&state=opened&assignee_username=byteblaze",
+ "locator": "",
+ "required_contents": {
+ "must_include": [
+ "404s, bad host, timeouts, bad urls for URLs linked from website"
+ ]
+ }
+ }
+ ]
+ },
+ "intent_template_id": 999
+ }
+]
\ No newline at end of file
diff --git a/environment_docker/README.md b/environment_docker/README.md
index ddf25d3..eb6ecee 100644
--- a/environment_docker/README.md
+++ b/environment_docker/README.md
@@ -1,6 +1,20 @@
-# Docker for WebArena Websites
+# Docker for WebArena and VisualWebArena Websites
This REAME file host the instructions for our Docker images and quick start guide for starting up websites used in VisualWebArena.
+# Table of Contents
+- [Pre-installed Amazon Machine Image (Recommended)](#pre-installed-amazon-machine-image-recommended)
+ * [Environment reset](#environment-reset)
+- [Individual Website Setup](#individual-wewbsite-setup)
+ * [[VWA] Classifieds Website](#vwa-classifieds-website)
+ * [[Both] Shopping Website (OneStopShop)](#both-shopping-website-onestopshop)
+ * [[WA] E-commerce Content Management System (CMS)](#wa-e-commerce-content-management-system-cms)
+ * [[Both] Social Forum Website (Reddit)](#both-social-forum-website-reddit)
+ * [[WA] Gitlab Website](#wa-gitlab-website)
+ * [[Both] Wikipedia Website](#both-wikipedia-website)
+ * [[Both] Homepage](#both-homepage)
+ * [[WA] Map](#wa-map)
+ * [[Both] Documentation sites](#both-documentation-sites)
+
## Pre-installed Amazon Machine Image (Recommended)
We provide an AMI which has the environments for **both** VisualWebArena and WebArena websites pre-installed. You can use the AMI to start a new EC2 instance:
@@ -74,11 +88,13 @@ However, if you are unable to use AWS AMI, read below to set up on your own mach
### Environment reset
After evaluating the examples, reset the environment to the initial state
```bash
-### For VWA:
+### For VisualWebArena:
bash scripts/reset_reddit.sh
bash reset_shopping.sh
curl -X POST http://:9980/index.php?page=reset -d "token=4b61655535e7ed388f0d40a93600254c"
+```
+```bash
### For WebArena:
docker stop shopping_admin gitlab
docker remove shopping_admin gitlab
@@ -87,7 +103,10 @@ docker run --name gitlab -d -p 8023:8023 gitlab-populated-final-port8023 /opt/gi
```
-## Classifieds Website
+## <a name="individual-wewbsite-setup"></a>Individual Website Setup
+We highly recommend setting up the environments with the AMI introduced above, but we also list the steps for setting up individual websites below. This allows you to set up selected websites *locally*.
+
+### [VWA] Classifieds Website
Download the image zip from one of the following:
- https://drive.google.com/file/d/1m79lp84yXfqdTBHr6IS7_1KkL4sDSemR/view
@@ -104,9 +123,9 @@ docker exec classifieds_db mysql -u root -ppassword osclass -e 'source docker-en
Now you can visit `http://:9980`.
-## Shopping Website (OneStopShop)
+### [Both] Shopping Website (OneStopShop)
-The Shopping Website follows the same setup as the same environment used in WebArena. Download the image tar from the following mirrors:
+Download the image tar from the following mirrors:
- https://drive.google.com/file/d/1gxXalk9O0p9eu1YkIJcmZta1nvvyAJpA/view?usp=sharing
- https://archive.org/download/webarena-env-shopping-image
- http://metis.lti.cs.cmu.edu/webarena-images/shopping_final_0712.tar
@@ -119,28 +138,34 @@ docker run --name shopping -p 7770:80 -d shopping_final_0712
docker exec shopping /var/www/magento2/bin/magento setup:store-config:set --base-url="http://:7770" # no trailing slash
docker exec shopping mysql -u magentouser -pMyPassword magentodb -e 'UPDATE core_config_data SET value="http://:7770/" WHERE path = "web/secure/base_url";'
docker exec shopping /var/www/magento2/bin/magento cache:flush
-
-# Disable re-indexing of products
-docker exec shopping /var/www/magento2/bin/magento indexer:set-mode schedule catalogrule_product
-docker exec shopping /var/www/magento2/bin/magento indexer:set-mode schedule catalogrule_rule
-docker exec shopping /var/www/magento2/bin/magento indexer:set-mode schedule catalogsearch_fulltext
-docker exec shopping /var/www/magento2/bin/magento indexer:set-mode schedule catalog_category_product
-docker exec shopping /var/www/magento2/bin/magento indexer:set-mode schedule customer_grid
-docker exec shopping /var/www/magento2/bin/magento indexer:set-mode schedule design_config_grid
-docker exec shopping /var/www/magento2/bin/magento indexer:set-mode schedule inventory
-docker exec shopping /var/www/magento2/bin/magento indexer:set-mode schedule catalog_product_category
-docker exec shopping /var/www/magento2/bin/magento indexer:set-mode schedule catalog_product_attribute
-docker exec shopping /var/www/magento2/bin/magento indexer:set-mode schedule catalog_product_price
-docker exec shopping /var/www/magento2/bin/magento indexer:set-mode schedule cataloginventory_stock
```
Now you can visit `http://:7770`.
-## Social Forum Website (Reddit)
+### [WA] E-commerce Content Management System (CMS)
+
+Download the image tar from the following mirrors:
+- https://drive.google.com/file/d/1See0ZhJRw0WTTL9y8hFlgaduwPZ_nGfd/view?usp=sharing
+- https://archive.org/download/webarena-env-shopping-admin-image
+- http://metis.lti.cs.cmu.edu/webarena-images/shopping_admin_final_0719.tar
-The Wikipedia Website follows the same setup procedure as the environment used in WebArena. Download the image tar from the following mirrors:
+```
+docker load --input shopping_admin_final_0719.tar
+docker run --name shopping_admin -p 7780:80 -d shopping_admin_final_0719
+# wait ~1 min for all services to start
+
+docker exec shopping_admin /var/www/magento2/bin/magento setup:store-config:set --base-url="http://:7780" # no trailing slash
+docker exec shopping_admin mysql -u magentouser -pMyPassword magentodb -e 'UPDATE core_config_data SET value="http://:7780/" WHERE path = "web/secure/base_url";'
+docker exec shopping_admin /var/www/magento2/bin/magento cache:flush
+```
+Now you can visit `http://:7780/admin`.
+
+
+### [Both] Social Forum Website (Reddit)
+
+Download the image tar from the following mirrors:
- https://drive.google.com/file/d/17Qpp1iu_mPqzgO_73Z9BnFjHrzmX9DGf/view?usp=sharing
-- https://archive.org/download/postmill-populated-exposed-withimg
+- https://archive.org/download/webarena-env-forum-image
- http://metis.lti.cs.cmu.edu/webarena-images/postmill-populated-exposed-withimg.tar
```
@@ -150,9 +175,26 @@ docker run --name forum -p 9999:80 -d postmill-populated-exposed-withimg
Now you can visit `http://:9999/`.
-## Wikipedia Website
+### [WA] Gitlab Website
+
+Download the image tar from the following mirrors:
+- https://drive.google.com/file/d/19W8qM0DPyRvWCLyQe0qtnCWAHGruolMR/view?usp=sharing
+- https://archive.org/download/webarena-env-gitlab-image
+- http://metis.lti.cs.cmu.edu/webarena-images/gitlab-populated-final-port8023.tar
+
+```
+docker load --input gitlab-populated-final-port8023.tar
+docker run --name gitlab -d -p 8023:8023 gitlab-populated-final-port8023 /opt/gitlab/embedded/bin/runsvdir-start
+
+# wait at least 5 mins for services to boot
+docker exec gitlab sed -i "s|^external_url.*|external_url 'http://:8023'|" /etc/gitlab/gitlab.rb
+docker exec gitlab gitlab-ctl reconfigure
+```
+It might take 5 mins to start and then you can visit `http://:8023/explore`.
+
+### [Both] Wikipedia Website
-The Wikipedia Website follows the same setup procedure as the environment used in WebArena. Download the data from the following mirrors:
+Download the data from the following mirrors:
- https://drive.google.com/file/d/1Um4QLxi_bGv5bP6kt83Ke0lNjuV9Tm0P/view?usp=sharing
- https://archive.org/download/webarena-env-wiki-image
- http://metis.lti.cs.cmu.edu/webarena-images/wikipedia_en_all_maxi_2022-05.zim
@@ -162,13 +204,12 @@ docker run -d --name=wikipedia --volume=/:/data
```
Now you can visit `http://:8888/wikipedia_en_all_maxi_2022-05/A/User:The_other_Kiwix_guy/Landing`.
-
-## Homepage
+### [Both] Homepage
The homepage lists all available websites which the agent can use to navigate to different sites.

-To host the homepage, first change `` to the corresponding server hostnames in [webarena-homepage/templates/index.html](webarena-homepage/templates/index.html)
+To host the homepage, first change `` to the corresponding server hostnames in [webarena-homepage/templates/index.html](webarena-homepage/templates/index.html)
```bash
# Define your actual server hostname
YOUR_ACTUAL_HOSTNAME=""
@@ -180,7 +221,13 @@ perl -pi -e "s||${YOUR_ACTUAL_HOSTNAME}|g" webarena-homepa
Then run
```
-cd webarena_homepage
+cd webarena-homepage
flask run --host=0.0.0.0 --port=4399
```
The homepage will be available at `http://:4399`.
+
+### [WA] Map
+Please refer to the AMI setup for the map.
+
+### [Both] Documentation sites
+We are still working on dockerizing the documentation sites. As they are read-only sites that usually don't change rapidly, it is safe to use their live sites for testing purposes for now.
diff --git a/evaluation_harness/__init__.py b/evaluation_harness/__init__.py
index fd0b27d..90a4075 100644
--- a/evaluation_harness/__init__.py
+++ b/evaluation_harness/__init__.py
@@ -16,4 +16,5 @@
shopping_get_sku_latest_review_author,
shopping_get_sku_latest_review_rating,
shopping_get_sku_latest_review_text,
+ PseudoPage
)
diff --git a/evaluation_harness/env_config.py b/evaluation_harness/env_config.py
new file mode 100644
index 0000000..26e1ae7
--- /dev/null
+++ b/evaluation_harness/env_config.py
@@ -0,0 +1,81 @@
+# Per-dataset website URLs (read from environment variables) and site accounts.
+import os
+
+DATASET = os.environ.get("DATASET", "")  # "" when unset, so the check below explains what to set
+if DATASET not in ("webarena", "visualwebarena"):
+    raise ValueError(
+        f"Please set the DATASET environment variable to `webarena` or `visualwebarena` (got {DATASET!r})"
+    )
+
+# WebArena: every site URL below has to be supplied through the environment.
+if DATASET == "webarena":
+    REDDIT = os.environ.get("REDDIT", "")
+    SHOPPING = os.environ.get("SHOPPING", "")
+    SHOPPING_ADMIN = os.environ.get("SHOPPING_ADMIN", "")
+    GITLAB = os.environ.get("GITLAB", "")
+    WIKIPEDIA = os.environ.get("WIKIPEDIA", "")
+    MAP = os.environ.get("MAP", "")
+    HOMEPAGE = os.environ.get("HOMEPAGE", "")
+    assert (
+        REDDIT
+        and SHOPPING
+        and SHOPPING_ADMIN
+        and GITLAB
+        and WIKIPEDIA
+        and MAP
+        and HOMEPAGE
+    ), (
+        "Please setup the URLs to each site. Current: \n"
+        + f"Reddit: {REDDIT}\n"
+        + f"Shopping: {SHOPPING}\n"
+        + f"Shopping Admin: {SHOPPING_ADMIN}\n"
+        + f"Gitlab: {GITLAB}\n"
+        + f"Wikipedia: {WIKIPEDIA}\n"
+        + f"Map: {MAP}\n"
+        + f"Homepage: {HOMEPAGE}\n"
+    )
+elif DATASET == "visualwebarena":
+    # VisualWebArena: site URLs plus the classifieds reset token are read from the environment.
+    REDDIT = os.environ.get("REDDIT", "")
+    SHOPPING = os.environ.get("SHOPPING", "")
+    WIKIPEDIA = os.environ.get("WIKIPEDIA", "")
+    HOMEPAGE = os.environ.get("HOMEPAGE", "")
+    CLASSIFIEDS = os.environ.get("CLASSIFIEDS", "")
+    CLASSIFIEDS_RESET_TOKEN = os.environ.get("CLASSIFIEDS_RESET_TOKEN", "")
+    REDDIT_RESET_URL = os.environ.get("REDDIT_RESET_URL", "")
+    # REDDIT_RESET_URL is not part of the check below, so it may stay empty.
+    assert (
+        REDDIT
+        and SHOPPING
+        and WIKIPEDIA
+        and HOMEPAGE
+        and CLASSIFIEDS
+        and CLASSIFIEDS_RESET_TOKEN
+    ), (
+        "Please setup the URLs and tokens to each site. Current: \n"
+        + f"Reddit: {REDDIT}\n"
+        + f"Shopping: {SHOPPING}\n"
+        + f"Wikipedia: {WIKIPEDIA}\n"
+        + f"Homepage: {HOMEPAGE}\n"
+        + f"Classifieds: {CLASSIFIEDS}\n"
+        + f"Classifieds reset token: {CLASSIFIEDS_RESET_TOKEN}\n"
+    )
+else:
+    # Defensive guard: only reachable if the DATASET membership check earlier in this file drifts from these branches.
+    raise ValueError(f"Dataset not implemented: {DATASET}")
+
+
+ACCOUNTS = {  # username/password pair per site, keyed by site name
+ "reddit": {"username": "MarvelsGrantMan136", "password": "test1234"},
+ "shopping": {
+ "username": "emma.lopez@gmail.com",
+ "password": "Password.123",
+ },
+ "classifieds": {
+ "username": "blake.sullivan@gmail.com",
+ "password": "Password.123",
+ },
+ "shopping_site_admin": {"username": "admin", "password": "admin1234"},
+ "shopping_admin": {"username": "admin", "password": "admin1234"},  # same credentials as "shopping_site_admin"
+ "gitlab": {"username": "byteblaze", "password": "hello1234"},
+}
diff --git a/evaluation_harness/eval_evaluators/fuzzy_match_dataset.json b/evaluation_harness/eval_evaluators/fuzzy_match_dataset.json
new file mode 100644
index 0000000..a0212cd
--- /dev/null
+++ b/evaluation_harness/eval_evaluators/fuzzy_match_dataset.json
@@ -0,0 +1,17873 @@
+{
+ "fuzzy_must_include": [
+ {
+ "task": "Show me the order statuses for order number 170 and 189.",
+ "reference": [
+ "170: cancelled",
+ "189: pending"
+ ],
+ "label": true,
+ "prediction": "Order number 170 is canceled, order number 189 is pending"
+ },
+ {
+ "task": "Provide the current statuses for tickets 101 and 202.",
+ "reference": [
+ "101: resolved",
+ "202: in progress"
+ ],
+ "label": false,
+ "plan": "I will change the status of ticket 101 and keep the status of ticket 202 the same",
+ "prediction": "The current statuses for the tickets are as follows: Ticket 101 is open, and Ticket 202 is in progress."
+ },
+ {
+ "task": "Show me how many orders are placed each month over the past 7 months",
+ "reference": [
+ "January: 10",
+ "February: 15",
+ "March: 20",
+ "April: 25",
+ "May: 30",
+ "June: 35",
+ "July: 40"
+ ],
+ "label": true,
+ "prediction": "Here are the number of orders placed each month over the past 7 months: In January, 10 orders were placed. In February, 15 orders were placed. In March, 20 orders were placed. In April, 25 orders were placed. In May, 30 orders were placed. In June, 35 orders were placed. In July, 40 orders were placed."
+ },
+ {
+ "task": "Summarize the sales revenue for each quarter of the past year",
+ "reference": [
+ "Q1: $58,900",
+ "Q2: $60,200",
+ "Q3: $70,001",
+ "Q4: $81,300"
+ ],
+ "label": false,
+ "plan": "I will change the revenue of Q2 and simulate a scenario where the revenue of Q4 cannot be found",
+ "prediction": "I found the sales revenue summary for the past year: In Q1, the revenue was $58,900. In Q2, the revenue was $60230. In Q3, the revenue was $70,001. Unfortunately I cannot find the revenue for Q4."
+ },
+ {
+ "task": "Give me the brand of the products that have 3 units left",
+ "reference": [
+ "Eos",
+ "Minerva"
+ ],
+ "label": false,
+ "plan": "I will change one of the brands and keep the other the same",
+ "prediction": "The brands of the products that have 3 units left are Eos and Athena."
+ },
+ {
+ "task": "Which brands appear most often in the top-3 search terms?",
+ "reference": [
+ "Hollister",
+ "Joust",
+ "nike"
+ ],
+ "label": false,
+ "plan": "I will change the capitalization of 'nike' and alter one brand name",
+ "prediction": "The brands that appear most often in the top-3 search terms are Hollister, Joust, and Nike."
+ },
+ {
+ "task": "Show me the customers who have expressed dissatisfaction with Olivia zip jacket?",
+ "reference": [
+ "Emma Lopez",
+ "Seam Miller"
+ ],
+ "label": true,
+ "prediction": "The customers who have expressed dissatisfaction with the Olivia zip jacket are Emma Lopez and Seam Miller."
+ },
+ {
+ "task": "Which US states border Pennsylvania?",
+ "reference": [
+ "Ohio",
+ "Maryland",
+ "New York",
+ "New Jersey",
+ "Delaware",
+ "West Virginia"
+ ],
+ "label": false,
+ "plan": "I will omit one state and add an incorrect one",
+ "prediction": "The US states that border Pennsylvania are Ohio, Maryland, New York, New Jersey, Delaware, and Virginia."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got the least stars?",
+ "reference": [
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ],
+ "label": true,
+ "prediction": "The repositories where you made contributions and they got the least stars are: cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, and remove-board-movement-events-from-the-github-issue-timeline."
+ },
+ {
+ "task": "Show me the walking distance from nearby hotels to Gardner Steel Conference Center, that take at most 5 minutes?",
+ "reference": [
+ "Wyndham Pittsburgh University Cente: 375m",
+ "The Oaklander Hotel: 338m"
+ ],
+ "label": false,
+ "plan": "I will change the distance for The Oaklander Hotel and keep the distance for Wyndham Pittsburgh University Center the same",
+ "prediction": "The walking distance from nearby hotels to Gardner Steel Conference Center that take at most 5 minutes are: Wyndham Pittsburgh University Center at 375 meters, and The Oaklander Hotel at 400 meters."
+ },
+ {
+ "task": "Which US states border Connecticut?",
+ "reference": [
+ "Rhode Island",
+ "Massachusetts",
+ "New York"
+ ],
+ "label": false,
+ "plan": "I will omit Rhode Island from the list of states that border Connecticut",
+ "prediction": "The US states that border Connecticut are Massachusetts and New York."
+ },
+ {
+ "task": "Show me the customers who have expressed dissatisfaction with Olivia zip jacket?",
+ "reference": [
+ "Emma Lopez",
+ "Seam Miller"
+ ],
+ "label": true,
+ "prediction": "The customers who have expressed dissatisfaction with the Olivia zip jacket are Emma Lopez and Seam Miller."
+ },
+ {
+ "task": "Which customer has completed the fifth most number of orders in the entire history?",
+ "reference": [
+ "Jane Doe",
+ "Matt Baker"
+ ],
+ "label": true,
+ "prediction": "The customer who has completed the fifth most number of orders in the entire history is Jane Doe, followed closely by Matt Baker."
+ },
+ {
+ "task": "Who gave 4 or 5 stars for phone cases from EYZUTAK",
+ "reference": [
+ "MH",
+ "Misba009",
+ "Amanda",
+ "Amazon Customer",
+ "Cally",
+ "Bethany Robertson"
+ ],
+ "label": false,
+ "plan": "I will omit Amazon Customer and change Bethany Robertson to Beth Robertson",
+ "prediction": "The customers who gave 4 or 5 stars for phone cases from EYZUTAK are MH, Misba009, Amanda, Cally, and Beth Robertson."
+ },
+ {
+ "task": "Which customer(s) has completed the second most number of orders in the entire history?",
+ "reference": [
+ "Adam Garcia",
+ "Michael Nguyen",
+ "Sarah Miller"
+ ],
+ "label": false,
+ "plan": "I will change Michael Nguyen to Michael Smith and omit Sarah Miller",
+ "prediction": "The customers who have completed the second most number of orders in the entire history are Adam Garcia and Michael Smith."
+ },
+ {
+ "task": "Give me the product names and the sizes of the products that have 2-3 units left",
+ "reference": [
+ "Eos V-Neck Hoodie: S",
+ "Minera Luma Tech V-Tee: XS"
+ ],
+ "label": false,
+ "plan": "I will change the size of Eos V-Neck Hoodie to M and keep Minera Luma Tech V-Tee the same",
+ "prediction": "The products that have 2-3 units left are: Eos V-Neck Hoodie in size M and Minera Luma Tech V-Tee in size XS."
+ },
+ {
+ "task": "Show me the walking distance from nearby hotels to Gardner Steel Conference Center, that take at most 5 minutes?",
+ "reference": [
+ "Wyndham Pittsburgh University Cente: 375m",
+ "The Oaklander Hotel: 338m"
+ ],
+ "label": true,
+ "prediction": "The walking distance from nearby hotels to Gardner Steel Conference Center, that take at most 5 minutes, are as follows: Wyndham Pittsburgh University Center is 375 meters away, and The Oaklander Hotel is 338 meters away."
+ },
+ {
+ "task": "Compare the time for walking and driving route from AMC Waterfront to Carnegie Mellon University",
+ "reference": [
+ "driving: 13min",
+ "walking: 1h 35min"
+ ],
+ "label": true,
+ "prediction": "The time comparison for the routes from AMC Waterfront to Carnegie Mellon University is as follows: Driving takes approximately 13 minutes, while walking takes around 1 hour and 35 minutes."
+ },
+ {
+ "task": "Which customer has completed the fifth most number of orders in the entire history?",
+ "reference": [
+ "Jane Doe",
+ "Matt Baker"
+ ],
+ "label": true,
+ "prediction": "The customers who have completed the fifth most number of orders in the entire history are Jane Doe and Matt Baker."
+ },
+ {
+ "task": "Show me the product names for order number 148.",
+ "reference": [
+ "Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress)",
+ "Russound 5B45W 4\" Indoor Outdoor Speakers White"
+ ],
+ "label": false,
+ "plan": "I will alter the product name of the first item and keep the second item the same.",
+ "prediction": "The product names for order number 148 are: Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 3' Cypress) and Russound 5B45W 4\" Indoor Outdoor Speakers White."
+ },
+ {
+ "task": "Show me the walking distance from nearby hotels to Gardner Steel Conference Center, that take at most 5 minutes?",
+ "reference": [
+ "Wyndham Pittsburgh University Cente: 375m",
+ "The Oaklander Hotel: 338m"
+ ],
+ "label": true,
+ "prediction": "The walking distances from nearby hotels to Gardner Steel Conference Center, that take at most 5 minutes, are as follows: Wyndham Pittsburgh University Center is 375 meters away, and The Oaklander Hotel is 338 meters away."
+ },
+ {
+ "task": "List the customer names who complain about the quality of EYZUTAK phone cases",
+ "reference": [
+ "Lisa Lee",
+ "Evelyn Kurver",
+ "Amanda",
+ "N Randall"
+ ],
+ "label": false,
+ "plan": "I will omit 'N Randall' from the list of customer names",
+ "prediction": "The customers who have complained about the quality of EYZUTAK phone cases are Lisa Lee, Evelyn Kurver, and Amanda."
+ },
+ {
+ "task": "I am arriving at Pittsburgh Airport. Show me the name of a Hyatt hotel if there is any nearby. Tell me the names of supermarkets that are within 15mins driving from the hotel",
+ "reference": [
+ "Hyatt Regency Pittsburgh International Airport",
+ "Giant Eagle",
+ "ALDI"
+ ],
+ "label": true,
+ "prediction": "Upon arriving at Pittsburgh Airport, you will find the Hyatt Regency Pittsburgh International Airport nearby. The supermarkets within a 15-minute drive from the hotel include Giant Eagle and ALDI."
+ },
+ {
+ "task": "Which US states border Pennsylvania?",
+ "reference": [
+ "Ohio",
+ "Maryland",
+ "New York",
+ "New Jersey",
+ "Delaware",
+ "West Virginia"
+ ],
+ "label": false,
+ "plan": "I will change 'Delaware' to 'Virginia' in the list of states",
+ "prediction": "The US states that border Pennsylvania are Ohio, Maryland, New York, New Jersey, Virginia, and West Virginia."
+ },
+ {
+ "task": "Which US states border New Hampshire?",
+ "reference": [
+ "Massachusetts",
+ "Vermont",
+ "Maine"
+ ],
+ "label": false,
+ "plan": "I will omit one state from the list.",
+ "prediction": "The US states that border New Hampshire are Massachusetts and Vermont."
+ },
+ {
+ "task": "What are the products ranked as the top-1 best-selling in 2022 pride month, including any ties at this ranks?",
+ "reference": [
+ "Harmony Lumaflex\u2122 Strength Band Kit",
+ "Affirm Water Bottle"
+ ],
+ "label": true,
+ "prediction": "The top-1 best-selling products in 2022 pride month, including any ties at this rank, are the Harmony Lumaflex\u2122 Strength Band Kit and the Affirm Water Bottle."
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to the nearest USPS postal office with different transportation methods?",
+ "reference": [
+ "Walk: 1 minute",
+ "Drive: less than 1 minute",
+ "Bike: less than 1 minute"
+ ],
+ "label": false,
+ "plan": "I will alter the time it takes to walk.",
+ "prediction": "From CMU Pittsburgh, it takes 2 minutes to walk to the nearest USPS postal office, less than 1 minute to drive, and less than 1 minute to bike."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "Dry",
+ "Uneven color"
+ ],
+ "label": false,
+ "plan": "I will add an extra criticism.",
+ "prediction": "The main criticisms of this product are that it is dry, has uneven color, and is too expensive."
+ },
+ {
+ "task": "Give me the SKU of the products that have 1-3 units left",
+ "reference": [
+ "WH11-S-Blue",
+ "WS08-XS-Blue"
+ ],
+ "label": false,
+ "plan": "I will change the SKU of one product and keep the other the same",
+ "prediction": "The SKUs of the products with 1-3 units left are WH11-S-Blue and WS08-XS-Green."
+ },
+ {
+ "task": "What are the products ranked as the top-2 best-selling in 2022, including any ties at these ranks?",
+ "reference": [
+ "Quest Lumaflex\u2122 Band",
+ "Cruise Dual Analog Watch",
+ "Sprite Stasis Ball 65 cm",
+ "Sprite Stasis Ball 55 cm"
+ ],
+ "label": true,
+ "prediction": "The top-2 best-selling products in 2022, including ties, are: Quest Lumaflex\u2122 Band, Cruise Dual Analog Watch, Sprite Stasis Ball 65 cm, and Sprite Stasis Ball 55 cm."
+ },
+ {
+ "task": "Which customer(s) has completed the second most number of orders in the entire history?",
+ "reference": [
+ "Adam Garcia",
+ "Michael Nguyen",
+ "Sarah Miller"
+ ],
+ "label": false,
+ "plan": "I will omit one customer from the list",
+ "prediction": "The customers who have completed the second most number of orders in the entire history are Adam Garcia and Michael Nguyen."
+ },
+ {
+ "task": "List the customer names who thinks EYZUTAK phone cases are of good looking",
+ "reference": [
+ "Lisa Lee",
+ "MH",
+ "Misba009",
+ "Amanda",
+ "N Randall",
+ "Amazon Customer",
+ "Cally",
+ "Bethany Robertson"
+ ],
+ "label": false,
+ "plan": "I will change one customer name and keep the rest the same",
+ "prediction": "The customers who think EYZUTAK phone cases are good looking include Lisa Lee, MH, Misba009, Amanda, N Randall, Amazon Customer, Cally, and Bethany Robinson."
+ },
+ {
+ "task": "Get the product name and discounted price (low to high) of the most recent completed order",
+ "reference": [
+ "Rapha Sports Short: $35",
+ "Thorpe Track Pant: $54.4",
+ "Mach Street Sweatshirt: $62"
+ ],
+ "label": false,
+ "plan": "I will change the price of Rapha Sports Short and keep the other prices the same",
+ "prediction": "Rapha Sports Short: $30, Thorpe Track Pant: $54.4, Mach Street Sweatshirt: $62"
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to university center at Carnegie Mellon University",
+ "reference": [
+ "El Gallo de Oro",
+ "Back Bar Grill",
+ "Grano",
+ "Beefsteak",
+ "Nourish",
+ "Schatz Dining Room",
+ "Au Bon Pain"
+ ],
+ "label": true,
+ "prediction": "The closest restaurants to the university center at Carnegie Mellon University include El Gallo de Oro, Back Bar Grill, Grano, Beefsteak, Nourish, Schatz Dining Room, and Au Bon Pain."
+ },
+ {
+ "task": "Compare the time for walking and driving route from 5000 Fifth Avenue, Pittsburgh to UPMC family health center",
+ "reference": [
+ "driving: 2min",
+ "walking: 16min"
+ ],
+ "label": true,
+ "prediction": "The time for driving from 5000 Fifth Avenue, Pittsburgh to UPMC Family Health Center is 2 minutes, whereas walking takes 16 minutes."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about complain of the customer service",
+ "reference": [
+ "Bob in Vegas",
+ "RemyR"
+ ],
+ "label": true,
+ "prediction": "The reviewers who mentioned complaints about the customer service are Bob in Vegas and RemyR."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about complain of the customer service",
+ "reference": [
+ "Bob in Vegas",
+ "RemyR"
+ ],
+ "label": true,
+ "prediction": "The reviewers who have mentioned complaints about customer service are Bob in Vegas and RemyR."
+ },
+ {
+ "task": "Which brands appear most often in the top-3 search terms?",
+ "reference": [
+ "Hollister",
+ "Joust",
+ "nike"
+ ],
+ "label": false,
+ "plan": "I will change 'nike' to 'Adidas' and keep 'Hollister' and 'Joust' the same",
+ "prediction": "The brands that appear most often in the top-3 search terms are Hollister, Joust, and Adidas."
+ },
+ {
+ "task": "List the top 2 most searched terms in my store",
+ "reference": [
+ "hollister",
+ "Joust Bag"
+ ],
+ "label": false,
+ "plan": "I will change 'hollister' to 'Nike Shoes' and keep 'Joust Bag' the same",
+ "prediction": "The top 2 most searched terms in your store are Nike Shoes and Joust Bag."
+ },
+ {
+ "task": "Compare the difference in time for walking and driving route from Randyland to Carnegie Mellon University",
+ "reference": [
+ "driving: 13min",
+ "walking: 1h 45min"
+ ],
+ "label": true,
+ "prediction": "The time it takes to travel from Randyland to Carnegie Mellon University is 13 minutes by driving and 1 hour and 45 minutes by walking."
+ },
+ {
+ "task": "Presents the monthly count of successful orders 01/2023-05/2023 in MOUNT:COUNT format",
+ "reference": [
+ "January: 12 orders",
+ "Feburary: 7 orders",
+ "March: 5 orders",
+ "April: 9 orders",
+ "May: 5 orders"
+ ],
+ "label": true,
+ "prediction": "January: 12 orders, February: 7 orders, March: 5 orders, April: 9 orders, May: 5 orders"
+ },
+ {
+ "task": "How much I spend each month from Jan to the end of March 2023 on shopping at One Stop Market?",
+ "reference": [
+ "Jan: 572.8",
+ "Feb: 762.18",
+ "Mar: 83.31"
+ ],
+ "label": true,
+ "prediction": "From January to March 2023, the spending at One Stop Market was as follows: Jan: $572.80, Feb: $762.18, Mar: $83.31."
+ },
+ {
+ "task": "What are the products ranked as the top-1 best-selling in Jan 2023, including any ties at this ranks?",
+ "reference": [
+ "Impulse Duffle",
+ "Overnight Duffle",
+ "Hawkeye Yoga Short"
+ ],
+ "label": true,
+ "prediction": "The top-1 best-selling products in January 2023, including any ties, are Impulse Duffle, Overnight Duffle, and Hawkeye Yoga Short."
+ },
+ {
+ "task": "Give me the brand of the products that have 3 units left",
+ "reference": [
+ "Eos",
+ "Minerva"
+ ],
+ "label": false,
+ "plan": "I will omit one of the brands from the list",
+ "prediction": "The brand of the products that have 3 units left is Eos."
+ },
+ {
+ "task": "Compare the time for walking and driving route from 5000 Fifth Avenue, Pittsburgh to UPMC family health center",
+ "reference": [
+ "driving: 2min",
+ "walking: 16min"
+ ],
+ "label": true,
+ "prediction": "The driving route from 5000 Fifth Avenue, Pittsburgh to UPMC family health center takes 2 minutes, while the walking route takes 16 minutes."
+ },
+ {
+ "task": "Which customer has placed 2 orders in the entire history?",
+ "reference": [
+ "Lisa Kim",
+ "Lisa Green",
+ "Julia Williams",
+ "Brian Smith",
+ "Alexander Thomas"
+ ],
+ "label": false,
+ "plan": "I will omit one name from the list",
+ "prediction": "The customers who have placed 2 orders in the entire history are Lisa Kim, Lisa Green, Julia Williams, and Brian Smith."
+ },
+ {
+ "task": "Show me the customers who have expressed dissatisfaction with Olivia zip jacket?",
+ "reference": [
+ "Emma Lopez",
+ "Seam Miller"
+ ],
+ "label": true,
+ "prediction": "The customers who have expressed dissatisfaction with the Olivia zip jacket are Emma Lopez and Seam Miller."
+ },
+ {
+ "task": "Which customer has placed 2 orders in the entire history?",
+ "reference": [
+ "Lisa Kim",
+ "Lisa Green",
+ "Julia Williams",
+ "Brian Smith",
+ "Alexander Thomas"
+ ],
+ "label": false,
+ "plan": "I will change one name in the list",
+ "prediction": "The customers who have placed 2 orders in the entire history are Lisa Kim, Lisa Green, Julia Williams, Brian Smith, and Alexander Thompson."
+ },
+ {
+ "task": "Compare the difference in time for walking and driving route from Randyland to Carnegie Mellon University",
+ "reference": [
+ "driving: 13min",
+ "walking: 1h 45min"
+ ],
+ "label": false,
+ "plan": "I will change the walking time while keeping the driving time the same",
+ "prediction": "The time difference for routes from Randyland to Carnegie Mellon University is as follows: driving takes 13 minutes, and walking takes 1 hour and 30 minutes."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got the most stars?",
+ "reference": [
+ "a11yproject.com",
+ "design"
+ ],
+ "label": false,
+ "plan": "I will change the name of the first repository",
+ "prediction": "The repositories where you made contributions and they got the most stars are: accessibilityproject.com and design."
+ },
+ {
+ "task": "Give me the brand of the products that have 3 units left",
+ "reference": [
+ "Eos",
+ "Minerva"
+ ],
+ "label": true,
+ "prediction": "The brands of the products that have 3 units left are Eos and Minerva."
+ },
+ {
+ "task": "Tell me the full address of all US international airports that are within a driving distance of 60 km to Niagara Falls",
+ "reference": [
+ "Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States",
+ "Buffalo-Niagara International Airport, Holtz Drive, Town of Cheektowaga, Erie County, New York, 14225, United States"
+ ],
+ "label": false,
+ "plan": "I will omit the address of the second airport",
+ "prediction": "The full address of the US international airport within a driving distance of 60 km to Niagara Falls is Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from Feb to Nov 2022 in MOUNT:COUNT format",
+ "reference": [
+ "Feburary: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "Octorbor: 4 orders",
+ "November: 5 orders"
+ ],
+ "label": true,
+ "prediction": "The monthly count of successful orders from Feb to Nov 2022 are as follows: February: 16 orders, March: 14 orders, April: 7 orders, May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, September: 10 orders, October: 4 orders, November: 5 orders."
+ },
+ {
+ "task": "Show me the customers who have expressed dissatisfaction with Antonia racer tank?",
+ "reference": [
+ "Shaunte",
+ "Merrie"
+ ],
+ "label": true,
+ "prediction": "The customers who have expressed dissatisfaction with the Antonia racer tank are Shaunte and Merrie."
+ },
+ {
+ "task": "List the top 3 most searched terms in my store",
+ "reference": [
+ "hollister",
+ "Joust Bag",
+ "nike"
+ ],
+ "label": true,
+ "prediction": "The top 3 most searched terms in your store are: hollister, Joust Bag, and nike."
+ },
+ {
+ "task": "Which US states border Vermont?",
+ "reference": [
+ "New York",
+ "New Hampshire",
+ "Massachusetts"
+ ],
+ "label": true,
+ "prediction": "The US states that border Vermont are New York, New Hampshire, and Massachusetts."
+ },
+ {
+ "task": "Compare the time for walking and driving route from AMC Waterfront to Univ of Pittsburgh",
+ "reference": [
+ "driving: 15min",
+ "walking: 1h 47min"
+ ],
+ "label": true,
+ "prediction": "The time for driving from AMC Waterfront to the University of Pittsburgh is 15 minutes, while walking takes 1 hour and 47 minutes."
+ },
+ {
+ "task": "I am arriving at Pittsburgh Airport. Show me the name of a Hyatt hotel if there is any nearby. Tell me the names of supermarkets that are within 15mins driving from the hotel",
+ "reference": [
+ "Hyatt Regency Pittsburgh International Airport",
+ "Giant Eagle",
+ "ALDI"
+ ],
+ "label": false,
+ "plan": "I will change the name of one of the supermarkets and omit one of them.",
+ "prediction": "The nearest Hyatt hotel to Pittsburgh Airport is the Hyatt Regency Pittsburgh International Airport. Within a 15-minute drive from the hotel, you can find a Giant Eagle and a Trader Joe's."
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to the nearest USPS postal office with different transportation methods?",
+ "reference": [
+ "Walk: 1 minute",
+ "Drive: less than 1 minute",
+ "Bike: less than 1 minute"
+ ],
+ "label": true,
+ "prediction": "From CMU Pittsburgh, it takes 1 minute to walk to the nearest USPS postal office. If you drive or bike, it will take less than 1 minute."
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to the nearest USPS postal office with different transportation methods?",
+ "reference": [
+ "Walk: 1 minute",
+ "Drive: less than 1 minute",
+ "Bike: less than 1 minute"
+ ],
+ "label": true,
+ "prediction": "If you are at CMU Pittsburgh, the nearest USPS postal office is just 1 minute away by walking. Driving or biking will take you less than a minute."
+ },
+ {
+ "task": "Which US states border Connecticut?",
+ "reference": [
+ "Rhode Island",
+ "Massachusetts",
+ "New York"
+ ],
+ "label": true,
+ "prediction": "The US states that border Connecticut are Rhode Island, Massachusetts, and New York."
+ },
+ {
+ "task": "Compare the difference in time for walking and driving route from Randyland to Carnegie Mellon University",
+ "reference": [
+ "driving: 13min",
+ "walking: 1h 45min"
+ ],
+ "label": true,
+ "prediction": "The driving route from Randyland to Carnegie Mellon University takes 13 minutes, while the walking route takes 1 hour and 45 minutes."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got the least stars?",
+ "reference": [
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ],
+ "label": true,
+ "prediction": "The repositories where you made contributions and they got the least stars are: cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, and remove-board-movement-events-from-the-github-issue-timeline."
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to university center at Carnegie Mellon University",
+ "reference": [
+ "El Gallo de Oro",
+ "Back Bar Grill",
+ "Grano",
+ "Beefsteak",
+ "Nourish",
+ "Schatz Dining Room",
+ "Au Bon Pain"
+ ],
+ "label": false,
+ "plan": "I will omit 'Back Bar Grill' to miss one element from the list.",
+ "prediction": "The closest restaurants to the university center at Carnegie Mellon University are El Gallo de Oro, Grano, Beefsteak, Nourish, Schatz Dining Room, and Au Bon Pain."
+ },
+ {
+ "task": "List the customer names who complain about the quality of EYZUTAK phone cases",
+ "reference": [
+ "Lisa Lee",
+ "Evelyn Kurver",
+ "Amanda",
+ "N Randall"
+ ],
+ "label": false,
+ "plan": "I will omit the name 'N Randall' from the list",
+ "prediction": "The customers who complained about the quality of EYZUTAK phone cases are Lisa Lee, Evelyn Kurver, and Amanda."
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the book names from posts that recommand a single book",
+ "reference": [
+ "A Christmas Carol",
+ "The Hobbit"
+ ],
+ "label": true,
+ "prediction": "The book names from the top 10 posts in the \"books\" forum that recommend a single book are 'A Christmas Carol' and 'The Hobbit'."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from Jan to December 2022 in MOUNT:COUNT format",
+ "reference": [
+ "January: 11 orders",
+ "Feburary: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "Octorbor: 4 orders",
+ "November: 5 orders",
+ "December: 10 orders"
+ ],
+ "label": false,
+ "plan": "I will change the count for February and misspell 'October' as 'Octobor'",
+ "prediction": "Here is the monthly count of successful orders from January to December 2022 in MOUNT:COUNT format: January: 11 orders, Feburary: 18 orders, March: 14 orders, April: 7 orders, May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, Sepetember: 10 orders, Octobor: 4 orders, November: 5 orders, December: 10 orders."
+ },
+ {
+ "task": "Who gave 1 or 2 stars for phone cases from EYZUTAK",
+ "reference": [
+ "Evelyn Kurver",
+ "N Randall"
+ ],
+ "label": false,
+ "plan": "I will add an additional name 'Amanda' to the list",
+ "prediction": "The customers who gave 1 or 2 stars for EYZUTAK phone cases are Evelyn Kurver, N Randall, and Amanda."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from Jan to December 2022 in MOUNT:COUNT format",
+ "reference": [
+ "January: 11 orders",
+ "Feburary: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "Octorbor: 4 orders",
+ "November: 5 orders",
+ "December: 10 orders"
+ ],
+ "label": true,
+ "prediction": "January: 11 orders, Feburary: 16 orders, March: 14 orders, April: 7 orders, May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, Sepetember: 10 orders, Octorbor: 4 orders, November: 5 orders, December: 10 orders"
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about ear cups being small",
+ "reference": [
+ "Joseph Brzezinski",
+ "Catso",
+ "Dibbins",
+ "Anglebert Dinkherhump",
+ "Michelle Davis"
+ ],
+ "label": true,
+ "prediction": "The reviewers who mentioned that the ear cups are small include Joseph Brzezinski, Catso, Dibbins, Anglebert Dinkherhump, and Michelle Davis."
+ },
+ {
+ "task": "Give me the brand of the products that have 3 units left",
+ "reference": [
+ "Eos",
+ "Minerva"
+ ],
+ "label": true,
+ "prediction": "The brands of the products that have 3 units left are Eos and Minerva."
+ },
+ {
+ "task": "Which US states border Massachusetts?",
+ "reference": [
+ "Rhode Island",
+ "Connecticut",
+ "New York",
+ "New Hampshire",
+ "Vermont"
+ ],
+ "label": false,
+ "plan": "I will omit New York from the list of states that border Massachusetts.",
+ "prediction": "The US states that border Massachusetts are Rhode Island, Connecticut, New Hampshire, and Vermont."
+ },
+ {
+ "task": "Compare the time for walking and driving route from Carnegie Science Center to Carnegie Mellon University",
+ "reference": [
+ "driving: 12min",
+ "walking: 1h 44min."
+ ],
+ "label": true,
+ "prediction": "The time for driving from Carnegie Science Center to Carnegie Mellon University is 12 minutes, while walking takes 1 hour and 44 minutes."
+ },
+ {
+ "task": "What are the products ranked as the top-2 best-selling in 2022, including any ties at these ranks?",
+ "reference": [
+ "Quest Lumaflex\u2122 Band",
+ "Cruise Dual Analog Watch",
+ "Sprite Stasis Ball 65 cm",
+ "Sprite Stasis Ball 55 cm"
+ ],
+ "label": false,
+ "plan": "I will omit the 'Sprite Stasis Ball 55 cm' from the list.",
+ "prediction": "The top-2 best-selling products in 2022 are the Quest Lumaflex\u2122 Band, the Cruise Dual Analog Watch, and the Sprite Stasis Ball 65 cm."
+ },
+ {
+ "task": "Compare the time for walking and driving route from AMC Waterfront to Univ of Pittsburgh",
+ "reference": [
+ "driving: 15min",
+ "walking: 1h 47min"
+ ],
+ "label": false,
+ "plan": "I will change the driving time and keep the walking time the same.",
+ "prediction": "The time for driving from AMC Waterfront to the University of Pittsburgh is 20 minutes, while walking takes 1 hour and 47 minutes."
+ },
+ {
+ "task": "Which US states border Massachusetts?",
+ "reference": [
+ "Rhode Island",
+ "Connecticut",
+ "New York",
+ "New Hampshire",
+ "Vermont"
+ ],
+ "label": true,
+ "prediction": "The US states that border Massachusetts are Rhode Island, Connecticut, New York, New Hampshire, and Vermont."
+ },
+ {
+ "task": "Which US states border Pennsylvania?",
+ "reference": [
+ "Ohio",
+ "Maryland",
+ "New York",
+ "New Jersey",
+ "Delaware",
+ "West Virginia"
+ ],
+ "label": true,
+ "prediction": "The US states that border Pennsylvania are Ohio, Maryland, New York, New Jersey, Delaware, and West Virginia."
+ },
+ {
+ "task": "Which US states border Massachusetts?",
+ "reference": [
+ "Rhode Island",
+ "Connecticut",
+ "New York",
+ "New Hampshire",
+ "Vermont"
+ ],
+ "label": true,
+ "prediction": "Massachusetts is bordered by the following US states: Rhode Island, Connecticut, New York, New Hampshire, and Vermont."
+ },
+ {
+ "task": "Show me the name of the customers who have expressed dissatisfaction with tanks products?",
+ "reference": [
+ "Alexander",
+ "Carma",
+ "Dominic",
+ "Merrie",
+ "Monroe",
+ "Scotty",
+ "Shaunte",
+ "Teofila",
+ "Valorie"
+ ],
+ "label": false,
+ "plan": "I will omit one name from the list and change one name to introduce an error",
+ "prediction": "The customers who have expressed dissatisfaction with tanks products include Alexander, Carma, Dominic, Merrie, Monroe, Scotty, Shaunte, and Teofilo."
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the post URLs that recommand a single book",
+ "reference": [
+ "http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol",
+ "http://www.reddit.com/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter"
+ ],
+ "label": true,
+ "prediction": "The URLs of the posts in the 'books' forum that recommend a single book are: http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol and http://www.reddit.com/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter."
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the book names from posts that recommand a single book",
+ "reference": [
+ "A Christmas Carol",
+ "The Hobbit"
+ ],
+ "label": false,
+ "plan": "I will include 'A Christmas Carol' and miss 'The Hobbit'",
+ "prediction": "Among the top 10 posts in the 'books' forum, one of the posts recommends 'A Christmas Carol'."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "I ordered the 16gb but I only got 14 gigs even though I formatted the card",
+ "The memory card is kind of slow on games and downloads",
+ "No original packaging It's used and the previous owners data has not been erased",
+ "The product is a legit sony hardware that have been owned by someone else before",
+ "The media could not be loaded",
+ "I could not format the card so I wasn\u2019t able to use it for my VITA"
+ ],
+ "label": false,
+ "plan": "I will change the first criticism and omit the last criticism",
+ "prediction": "I ordered the 16gb but I only got 12 gigs even though I formatted the card. The memory card is kind of slow on games and downloads. No original packaging It's used and the previous owners data has not been erased. The product is a legit sony hardware that have been owned by someone else before. The media could not be loaded."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "Dry",
+ "Uneven color"
+ ],
+ "label": true,
+ "prediction": "The main criticisms of this product are that it is dry and has an uneven color."
+ },
+ {
+ "task": "Compare the time for walking and driving route from AMC Waterfront to Carnegie Mellon University",
+ "reference": [
+ "driving: 13min",
+ "walking: 1h 35min"
+ ],
+ "label": false,
+ "plan": "I will change the walking time and keep the driving time the same",
+ "prediction": "The time for the driving route from AMC Waterfront to Carnegie Mellon University is 13 minutes, while the walking route takes 1 hour and 20 minutes."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "Dry",
+ "Uneven color"
+ ],
+ "label": true,
+ "prediction": "The main criticisms of this product are that it is dry and has an uneven color."
+ },
+ {
+ "task": "List the customer names who thinks EYZUTAK phone cases are of good looking",
+ "reference": [
+ "Lisa Lee",
+ "MH",
+ "Misba009",
+ "Amanda",
+ "N Randall",
+ "Amazon Customer",
+ "Cally",
+ "Bethany Robertson"
+ ],
+ "label": false,
+ "plan": "I will omit a few customer names and add an extra one.",
+ "prediction": "The customers who think EYZUTAK phone cases are good looking include Lisa Lee, MH, Amanda, and an additional customer named John Doe."
+ },
+ {
+ "task": "Show me the name of the customers who have expressed dissatisfaction with tanks products?",
+ "reference": [
+ "Alexander",
+ "Carma",
+ "Dominic",
+ "Merrie",
+ "Monroe",
+ "Scotty",
+ "Shaunte",
+ "Teofila",
+ "Valorie"
+ ],
+ "label": false,
+ "plan": "I will change the name of one customer and omit another one.",
+ "prediction": "The customers who have expressed dissatisfaction with tanks products include Alexander, Carma, Dominic, Monroe, Scotty, Shaunte, Teofila, and Valerie."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got less than 5 stars?",
+ "reference": [
+ "a11y-syntax-highlighting",
+ "a11y-webring.club",
+ "accessible-html-content-patterns",
+ "ericwbailey.website",
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ],
+ "label": true,
+ "prediction": "The full names of the repositories where you made contributions and they got less than 5 stars are: a11y-syntax-highlighting, a11y-webring.club, accessible-html-content-patterns, ericwbailey.website, cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, and remove-board-movement-events-from-the-github-issue-timeline."
+ },
+ {
+ "task": "Compare the time for walking and driving route from AMC Waterfront to Carnegie Mellon University",
+ "reference": [
+ "driving: 13min",
+ "walking: 1h 35min"
+ ],
+ "label": true,
+ "prediction": "The driving route from AMC Waterfront to Carnegie Mellon University takes 13 minutes, whereas walking takes 1 hour and 35 minutes."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got no stars?",
+ "reference": [
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ],
+ "label": false,
+ "plan": "I will omit 'remove-board-movement-events-from-the-github-issue-timeline' and change 'timeit' to 'timekeeper'.",
+ "prediction": "The repositories where you made contributions and they got no stars are: cloud-to-butt, dotfiles, timekeeper, solarized-prism-theme, and gimmiethat.space."
+ },
+ {
+ "task": "Get the product name and discounted price (low to high) of the most recent completed order",
+ "reference": [
+ "Rapha Sports Short: $35",
+ "Thorpe Track Pant: $54.4",
+ "Mach Street Sweatshirt: $62"
+ ],
+ "label": false,
+ "plan": "I will change the price of 'Thorpe Track Pant' and omit 'Mach Street Sweatshirt'.",
+ "prediction": "The most recent completed order includes: Rapha Sports Short for $35 and Thorpe Track Pant for $50."
+ },
+ {
+ "task": "Show me the product names for order number 148.",
+ "reference": [
+ "Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress)",
+ "Russound 5B45W 4\" Indoor Outdoor Speakers White"
+ ],
+ "label": true,
+ "prediction": "For order number 148, the products are: Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress) and Russound 5B45W 4\" Indoor Outdoor Speakers White."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got the least stars?",
+ "reference": [
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ],
+ "label": false,
+ "plan": "I will change the name of 'remove-board-movement-events-from-the-github-issue-timeline' and keep the others the same",
+ "prediction": "The repositories are cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, and remove-issue-timeline-events-from-github."
+ },
+ {
+ "task": "List the top 3 most searched terms in my store",
+ "reference": [
+ "hollister",
+ "Joust Bag",
+ "nike"
+ ],
+ "label": false,
+ "plan": "I will change the term 'Joust Bag' to 'Joust Backpack' and keep the others the same",
+ "prediction": "The top 3 most searched terms in your store are hollister, Joust Backpack, and nike."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about complain of the customer service",
+ "reference": [
+ "Bob in Vegas",
+ "RemyR"
+ ],
+ "label": true,
+ "prediction": "The reviewers who mentioned complaints about customer service are Bob in Vegas and RemyR."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from Jan to Nov 2022 in MOUNT:COUNT format",
+ "reference": [
+ "January: 11 orders",
+ "Feburary: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "Octorbor: 4 orders",
+ "November: 5 orders"
+ ],
+ "label": false,
+ "plan": "I will change the count for March and misspell 'November'",
+ "prediction": "January: 11 orders, Feburary: 16 orders, March: 15 orders, April: 7 orders, May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, Sepetember: 10 orders, Octorbor: 4 orders, Novembr: 5 orders."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got the least stars?",
+ "reference": [
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ],
+ "label": false,
+ "plan": "I will omit 'remove-board-movement-events-from-the-github-issue-timeline' from the list",
+ "prediction": "The repositories where you made contributions and they got the least stars are: cloud-to-butt, dotfiles, timeit, solarized-prism-theme, and gimmiethat.space."
+ },
+ {
+ "task": "Compare the difference in time for walking and driving route from Randyland to Carnegie Mellon University",
+ "reference": [
+ "driving: 13min",
+ "walking: 1h 45min"
+ ],
+ "label": false,
+ "plan": "I will change the walking time and keep the driving time the same",
+ "prediction": "The time difference between walking and driving from Randyland to Carnegie Mellon University is significant. Driving takes about 13 minutes, while walking takes approximately 1 hour and 30 minutes."
+ },
+ {
+ "task": "Which customer has completed the fifth most number of orders in the entire history?",
+ "reference": [
+ "Jane Doe",
+ "Matt Baker"
+ ],
+ "label": false,
+ "plan": "I will change 'Jane Doe' to 'John Smith' and keep 'Matt Baker' the same",
+ "prediction": "The customer who has completed the fifth most number of orders in the entire history is John Smith, followed closely by Matt Baker."
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the book names from posts that recommand a single book",
+ "reference": [
+ "A Christmas Carol",
+ "The Hobbit"
+ ],
+ "label": false,
+ "plan": "I will add an extra book '1984' to the list",
+ "prediction": "Among the top 10 posts in the 'books' forum, the books recommended from posts that suggest a single book are: A Christmas Carol, The Hobbit, and 1984."
+ },
+ {
+ "task": "Which US states border New Hampshire?",
+ "reference": [
+ "Massachusetts",
+ "Vermont",
+ "Maine"
+ ],
+ "label": false,
+ "plan": "I will omit one state from the list.",
+ "prediction": "The US states that border New Hampshire are Massachusetts and Vermont."
+ },
+ {
+ "task": "Show me the walking distance from nearby hotels to Gardner Steel Conference Center, that take at most 5 minutes?",
+ "reference": [
+ "Wyndham Pittsburgh University Cente: 375m",
+ "The Oaklander Hotel: 338m"
+ ],
+ "label": false,
+ "plan": "I will alter the distance of one hotel and keep the other the same.",
+ "prediction": "The walking distance from nearby hotels to Gardner Steel Conference Center within 5 minutes are: Wyndham Pittsburgh University Center at 400m and The Oaklander Hotel at 338m."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about ear cups being small",
+ "reference": [
+ "Joseph Brzezinski",
+ "Catso",
+ "Dibbins",
+ "Anglebert Dinkherhump",
+ "Michelle Davis"
+ ],
+ "label": false,
+ "plan": "I will change one name in the list of reviewers.",
+ "prediction": "The reviewers who mentioned the ear cups being small are Joseph Brzezinski, Catso, Dibbins, Anglebert Dinkherhump, and Michael Davis."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got the most stars?",
+ "reference": [
+ "a11yproject.com",
+ "design"
+ ],
+ "label": true,
+ "prediction": "The repositories where you made contributions and they got the most stars are a11yproject.com and design."
+ },
+ {
+ "task": "Compare the time for walking and driving route from 5000 Fifth Avenue, Pittsburgh to UPMC family health center",
+ "reference": [
+ "driving: 2min",
+ "walking: 16min"
+ ],
+ "label": false,
+ "plan": "I will change the walking time to 15 minutes and keep the driving time the same",
+ "prediction": "The driving route from 5000 Fifth Avenue, Pittsburgh to UPMC family health center takes 2 minutes, while the walking route takes 15 minutes."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got no stars?",
+ "reference": [
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ],
+ "label": false,
+ "plan": "I will omit 'timeit' from the list of repositories",
+ "prediction": "The repositories where you made contributions and they got no stars are: cloud-to-butt, dotfiles, solarized-prism-theme, gimmiethat.space, and remove-board-movement-events-from-the-github-issue-timeline."
+ },
+ {
+ "task": "Compare the time for walking and driving route from 5000 Fifth Avenue, Pittsburgh to UPMC family health center",
+ "reference": [
+ "driving: 2min",
+ "walking: 16min"
+ ],
+ "label": true,
+ "prediction": "The time it takes to travel from 5000 Fifth Avenue, Pittsburgh to UPMC family health center is 2 minutes by driving and 16 minutes by walking."
+ },
+ {
+ "task": "Who gave 1 or 2 stars for phone cases from EYZUTAK",
+ "reference": [
+ "Evelyn Kurver",
+ "N Randall"
+ ],
+ "label": true,
+ "prediction": "The individuals who gave 1 or 2 stars for phone cases from EYZUTAK are Evelyn Kurver and N Randall."
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to the nearest USPS postal office with different transportation methods?",
+ "reference": [
+ "Walk: 1 minute",
+ "Drive: less than 1 minute",
+ "Bike: less than 1 minute"
+ ],
+ "label": false,
+ "plan": "I will change the time it takes to walk to the USPS postal office.",
+ "prediction": "If you walk, it takes about 3 minutes. If you drive, it takes less than a minute. If you bike, it also takes less than a minute."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got the most stars?",
+ "reference": [
+ "a11yproject.com",
+ "design"
+ ],
+ "label": false,
+ "plan": "I will change the name of the second repository.",
+ "prediction": "You made contributions to the following repositories which got the most stars: a11yproject.com and creative-design."
+ },
+ {
+ "task": "List the customer names who thinks EYZUTAK phone cases are of good looking",
+ "reference": [
+ "Lisa Lee",
+ "MH",
+ "Misba009",
+ "Amanda",
+ "N Randall",
+ "Amazon Customer",
+ "Cally",
+ "Bethany Robertson"
+ ],
+ "label": true,
+ "prediction": "The customers who think EYZUTAK phone cases are good looking are: Lisa Lee, MH, Misba009, Amanda, N Randall, Amazon Customer, Cally, and Bethany Robertson."
+ },
+ {
+ "task": "Show me the order statuses for order number 170 and 189.",
+ "reference": [
+ "170: cancelled",
+ "189: pending"
+ ],
+ "label": false,
+ "plan": "I will change the status of order number 170.",
+ "prediction": "Order number 170 is pending, and order number 189 is pending."
+ },
+ {
+ "task": "Which customer has placed 2 orders in the entire history?",
+ "reference": [
+ "Lisa Kim",
+ "Lisa Green",
+ "Julia Williams",
+ "Brian Smith",
+ "Alexander Thomas"
+ ],
+ "label": true,
+ "prediction": "The customers who have placed 2 orders in the entire history are Lisa Kim, Lisa Green, Julia Williams, Brian Smith, and Alexander Thomas."
+ },
+ {
+ "task": "Show me the name of the customers who have expressed dissatisfaction with tanks products?",
+ "reference": [
+ "Alexander",
+ "Carma",
+ "Dominic",
+ "Merrie",
+ "Monroe",
+ "Scotty",
+ "Shaunte",
+ "Teofila",
+ "Valorie"
+ ],
+ "label": true,
+ "prediction": "The customers who have expressed dissatisfaction with tanks products are Alexander, Carma, Dominic, Merrie, Monroe, Scotty, Shaunte, Teofila, and Valorie."
+ },
+ {
+ "task": "Show me the name of the customers who have expressed dissatisfaction with tanks products?",
+ "reference": [
+ "Alexander",
+ "Carma",
+ "Dominic",
+ "Merrie",
+ "Monroe",
+ "Scotty",
+ "Shaunte",
+ "Teofila",
+ "Valorie"
+ ],
+ "label": true,
+ "prediction": "The customers who have expressed dissatisfaction with tanks products are Alexander, Carma, Dominic, Merrie, Monroe, Scotty, Shaunte, Teofila, and Valorie."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about average print quality",
+ "reference": [
+ "Goldfish",
+ "Roxanne Brandon Coffey"
+ ],
+ "label": true,
+ "prediction": "The reviewers who mention about average print quality are Goldfish and Roxanne Brandon Coffey."
+ },
+ {
+ "task": "Which US states border Massachusetts?",
+ "reference": [
+ "Rhode Island",
+ "Connecticut",
+ "New York",
+ "New Hampshire",
+ "Vermont"
+ ],
+ "label": false,
+ "plan": "I will omit New York from the list of bordering states.",
+ "prediction": "The US states that border Massachusetts are Rhode Island, Connecticut, New Hampshire, and Vermont."
+ },
+ {
+ "task": "What are the products ranked as the top-1 best-selling in Jan 2023, including any ties at this ranks?",
+ "reference": [
+ "Impulse Duffle",
+ "Overnight Duffle",
+ "Hawkeye Yoga Short"
+ ],
+ "label": false,
+ "plan": "I will change 'Impulse Duffle' to 'Impulse Backpack' and keep the other products the same.",
+ "prediction": "The top-1 best-selling products in January 2023 are Impulse Backpack, Overnight Duffle, and Hawkeye Yoga Short."
+ },
+ {
+ "task": "Which US states border Vermont?",
+ "reference": [
+ "New York",
+ "New Hampshire",
+ "Massachusetts"
+ ],
+ "label": true,
+ "prediction": "The US states that border Vermont are New York, New Hampshire, and Massachusetts."
+ },
+ {
+ "task": "Give me the SKU of the products that have 1-3 units left",
+ "reference": [
+ "WH11-S-Blue",
+ "WS08-XS-Blue"
+ ],
+ "label": true,
+ "prediction": "The SKUs of the products that have 1-3 units left are WH11-S-Blue and WS08-XS-Blue."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "Dry",
+ "Uneven color"
+ ],
+ "label": false,
+ "plan": "I will include 'Dry' and exclude 'Uneven color' from the prediction.",
+ "prediction": "The main criticism of this product is that it is dry."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "The 39 was too small. I am afraid the 40 will be too big",
+ "I was very sad when the shoe rubbed up against my baby toe",
+ "I had to return them because I knew in time it would tear up my feet",
+ "The problem is that the strap is made of some really stiff leather and is painful to my heel",
+ "The front is also uncomfortably tight",
+ "The Dansko's were similar (not as bad) and loosened up over time"
+ ],
+ "label": false,
+ "plan": "I will change the criticism about the strap and omit the criticism about the front being tight.",
+ "prediction": "The main criticisms of this product include: The 39 was too small and the 40 might be too big, the shoe rubbed against the baby toe, it might tear up feet over time, the strap is made of stiff material and is uncomfortable, and the Dansko's were similar but loosened up over time."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about good fingerprint resistant",
+ "reference": [
+ "Rachel",
+ "T. Gannon"
+ ],
+ "label": true,
+ "prediction": "The reviewers who mentioned good fingerprint resistance are Rachel and T. Gannon."
+ },
+ {
+ "task": "What are the products ranked as the top-1 best-selling in Jan 2023, including any ties at this ranks?",
+ "reference": [
+ "Impulse Duffle",
+ "Overnight Duffle",
+ "Hawkeye Yoga Short"
+ ],
+ "label": false,
+ "plan": "I will list 'Impulse Duffle' and 'Overnight Duffle' but exclude 'Hawkeye Yoga Short'.",
+ "prediction": "The products ranked as the top-1 best-selling in Jan 2023 are Impulse Duffle and Overnight Duffle."
+ },
+ {
+ "task": "What are the products ranked as the top-1 best-selling in 2022 pride month, including any ties at this ranks?",
+ "reference": [
+ "Harmony Lumaflex\u2122 Strength Band Kit",
+ "Affirm Water Bottle"
+ ],
+ "label": true,
+ "prediction": "The top-1 best-selling products in 2022 pride month, including any ties, are the Harmony Lumaflex\u2122 Strength Band Kit and the Affirm Water Bottle."
+ },
+ {
+ "task": "Which US states border Pennsylvania?",
+ "reference": [
+ "Ohio",
+ "Maryland",
+ "New York",
+ "New Jersey",
+ "Delaware",
+ "West Virginia"
+ ],
+ "label": true,
+ "prediction": "The US states that border Pennsylvania are Ohio, Maryland, New York, New Jersey, Delaware, and West Virginia."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about good fingerprint resistant",
+ "reference": [
+ "Rachel",
+ "T. Gannon"
+ ],
+ "label": false,
+ "plan": "I will omit one of the reviewers and change the name of the other.",
+ "prediction": "The reviewers who mentioned good fingerprint resistance include Rachael and T. G."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got no stars?",
+ "reference": [
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ],
+ "label": true,
+ "prediction": "The repositories where you made contributions and received no stars are: cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, and remove-board-movement-events-from-the-github-issue-timeline."
+ },
+ {
+ "task": "List the customer names who complain about the quality of EYZUTAK phone cases",
+ "reference": [
+ "Lisa Lee",
+ "Evelyn Kurver",
+ "Amanda",
+ "N Randall"
+ ],
+ "label": true,
+ "prediction": "The customers who have complained about the quality of EYZUTAK phone cases are Lisa Lee, Evelyn Kurver, Amanda, and N Randall."
+ },
+ {
+ "task": "List the top 3 most searched terms in my store",
+ "reference": [
+ "hollister",
+ "Joust Bag",
+ "nike"
+ ],
+ "label": false,
+ "plan": "I will change the second searched term and keep the other two the same",
+ "prediction": "The top 3 most searched terms in your store are hollister, Backpack, and nike."
+ },
+ {
+ "task": "Show me the order statuses for order number 170 and 189.",
+ "reference": [
+ "170: cancelled",
+ "189: pending"
+ ],
+ "label": true,
+ "prediction": "Order number 170 has been cancelled, while order number 189 is currently pending."
+ },
+ {
+ "task": "What are the products ranked as the top-1 best-selling in 2022 pride month, including any ties at this ranks?",
+ "reference": [
+ "Harmony Lumaflex\u2122 Strength Band Kit",
+ "Affirm Water Bottle"
+ ],
+ "label": false,
+ "plan": "I will omit the 'Affirm Water Bottle' from the list",
+ "prediction": "The product ranked as the top-1 best-selling in 2022 pride month is the Harmony Lumaflex\u2122 Strength Band Kit."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about good fingerprint resistant",
+ "reference": [
+ "Rachel",
+ "T. Gannon"
+ ],
+ "label": false,
+ "plan": "I will omit T. Gannon from the list of reviewers.",
+ "prediction": "The reviewers who mentioned good fingerprint resistance include Rachel."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "The 39 was too small. I am afraid the 40 will be too big",
+ "I was very sad when the shoe rubbed up against my baby toe",
+ "I had to return them because I knew in time it would tear up my feet",
+ "The problem is that the strap is made of some really stiff leather and is painful to my heel",
+ "The front is also uncomfortably tight",
+ "The Dansko's were similar (not as bad) and loosened up over time"
+ ],
+ "label": true,
+ "prediction": "The main criticisms of this product are as follows: The 39 was too small, and there is a concern that the 40 will be too big. One customer was very sad when the shoe rubbed up against their baby toe. Another had to return them because they knew in time it would tear up their feet. The strap is made of some really stiff leather and is painful to the heel. The front is also uncomfortably tight. Lastly, the Dansko's were similar but not as bad and loosened up over time."
+ },
+ {
+ "task": "List the customer names who thinks EYZUTAK phone cases are of good looking",
+ "reference": [
+ "Lisa Lee",
+ "MH",
+ "Misba009",
+ "Amanda",
+ "N Randall",
+ "Amazon Customer",
+ "Cally",
+ "Bethany Robertson"
+ ],
+ "label": true,
+ "prediction": "The customers who think EYZUTAK phone cases are good looking are Lisa Lee, MH, Misba009, Amanda, N Randall, Amazon Customer, Cally, and Bethany Robertson."
+ },
+ {
+ "task": "Which customer(s) has completed the second most number of orders in the entire history?",
+ "reference": [
+ "Adam Garcia",
+ "Michael Nguyen",
+ "Sarah Miller"
+ ],
+ "label": false,
+ "plan": "I will change the name Michael Nguyen to Michael Smith and omit Sarah Miller.",
+ "prediction": "The customer who has completed the second most number of orders in the entire history is Adam Garcia and Michael Smith."
+ },
+ {
+ "task": "List the top 2 most searched terms in my store",
+ "reference": [
+ "hollister",
+ "Joust Bag"
+ ],
+ "label": false,
+ "plan": "I will change 'Joust Bag' to 'Joust Backpack'",
+ "prediction": "The top 2 most searched terms in your store are 'hollister' and 'Joust Backpack'."
+ },
+ {
+ "task": "What are the products ranked as the top-1 best-selling in 2022 pride month, including any ties at this ranks?",
+ "reference": [
+ "Harmony Lumaflex\u2122 Strength Band Kit",
+ "Affirm Water Bottle"
+ ],
+ "label": false,
+ "plan": "I will change 'Affirm Water Bottle' to 'Affirm Fitness Bottle'",
+ "prediction": "The top-1 best-selling products in 2022 pride month are 'Harmony Lumaflex\u2122 Strength Band Kit' and 'Affirm Fitness Bottle'."
+ },
+ {
+ "task": "Give me the brand of the products that have 3 units left",
+ "reference": [
+ "Eos",
+ "Minerva"
+ ],
+ "label": true,
+ "prediction": "The brands of the products that have 3 units left are Eos and Minerva."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from Jan to Nov 2022 in MOUNT:COUNT format",
+ "reference": [
+ "January: 11 orders",
+ "Feburary: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "Octorbor: 4 orders",
+ "November: 5 orders"
+ ],
+ "label": true,
+ "prediction": "Here are the monthly counts of successful orders from January to November 2022 in MOUNT:COUNT format: January: 11 orders, Feburary: 16 orders, March: 14 orders, April: 7 orders, May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, Sepetember: 10 orders, Octorbor: 4 orders, November: 5 orders."
+ },
+ {
+ "task": "Give me the SKU of the products that have 1-3 units left",
+ "reference": [
+ "WH11-S-Blue",
+ "WS08-XS-Blue"
+ ],
+ "label": true,
+ "prediction": "The SKUs of the products that have 1-3 units left are WH11-S-Blue and WS08-XS-Blue."
+ },
+ {
+ "task": "Which US states border New Hampshire?",
+ "reference": [
+ "Massachusetts",
+ "Vermont",
+ "Maine"
+ ],
+ "label": true,
+ "prediction": "The US states that border New Hampshire are Massachusetts, Vermont, and Maine."
+ },
+ {
+ "task": "Show me the name of the customers who have expressed dissatisfaction with tanks products?",
+ "reference": [
+ "Alexander",
+ "Carma",
+ "Dominic",
+ "Merrie",
+ "Monroe",
+ "Scotty",
+ "Shaunte",
+ "Teofila",
+ "Valorie"
+ ],
+ "label": true,
+ "prediction": "The customers who have expressed dissatisfaction with tanks products are Alexander, Carma, Dominic, Merrie, Monroe, Scotty, Shaunte, Teofila, and Valorie."
+ },
+ {
+ "task": "Compare the time for walking and driving route from AMC Waterfront to Carnegie Mellon University",
+ "reference": [
+ "driving: 13min",
+ "walking: 1h 35min"
+ ],
+ "label": true,
+ "prediction": "The time for walking from AMC Waterfront to Carnegie Mellon University is 1 hour and 35 minutes, whereas the driving time is only 13 minutes."
+ },
+ {
+ "task": "What are the products ranked as the top-1 best-selling in Jan 2023, including any ties at this ranks?",
+ "reference": [
+ "Impulse Duffle",
+ "Overnight Duffle",
+ "Hawkeye Yoga Short"
+ ],
+ "label": true,
+ "prediction": "The top-1 best-selling products in January 2023, including any ties, are: Impulse Duffle, Overnight Duffle, and Hawkeye Yoga Short."
+ },
+ {
+ "task": "Give me the product names and the sizes of the products that have 2-3 units left",
+ "reference": [
+ "Eos V-Neck Hoodie: S",
+ "Minera Luma Tech V-Tee: XS"
+ ],
+ "label": true,
+ "prediction": "The products that have 2-3 units left are: Eos V-Neck Hoodie in size S, and Minera Luma Tech V-Tee in size XS."
+ },
+ {
+ "task": "How much I spend each month from Jan to the end of March 2023 on shopping at One Stop Market?",
+ "reference": [
+ "Jan: 572.8",
+ "Feb: 762.18",
+ "Mar: 83.31"
+ ],
+ "label": true,
+ "prediction": "From January to the end of March 2023, the spending at One Stop Market was as follows: January: $572.8, February: $762.18, and March: $83.31."
+ },
+ {
+ "task": "Presents the monthly count of successful orders 01/2023-05/2023 in MOUNT:COUNT format",
+ "reference": [
+ "January: 12 orders",
+ "Feburary: 7 orders",
+ "March: 5 orders",
+ "April: 9 orders",
+ "May: 5 orders"
+ ],
+ "label": true,
+ "prediction": "The monthly count of successful orders from January to May 2023 is as follows: January: 12 orders, February: 7 orders, March: 5 orders, April: 9 orders, and May: 5 orders."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got the most stars?",
+ "reference": [
+ "a11yproject.com",
+ "design"
+ ],
+ "label": true,
+ "prediction": "The repositories where you made contributions and they got the most stars are a11yproject.com and design."
+ },
+ {
+ "task": "Which US states border Vermont?",
+ "reference": [
+ "New York",
+ "New Hampshire",
+ "Massachusetts"
+ ],
+ "label": false,
+ "plan": "I will omit New Hampshire from the list of states that border Vermont.",
+ "prediction": "The US states that border Vermont are New York and Massachusetts."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about complain of the customer service",
+ "reference": [
+ "Bob in Vegas",
+ "RemyR"
+ ],
+ "label": false,
+ "plan": "I will change 'RemyR' to 'Remy' and keep 'Bob in Vegas' the same.",
+ "prediction": "The reviewers who mentioned complaints about the customer service are Bob in Vegas and Remy."
+ },
+ {
+ "task": "Show me the walking distance from nearby hotels to Gardner Steel Conference Center, that take at most 5 minutes?",
+ "reference": [
+ "Wyndham Pittsburgh University Cente: 375m",
+ "The Oaklander Hotel: 338m"
+ ],
+ "label": true,
+ "prediction": "The walking distances from nearby hotels to Gardner Steel Conference Center, taking at most 5 minutes, are as follows: Wyndham Pittsburgh University Center is 375 meters away, and The Oaklander Hotel is 338 meters away."
+ },
+ {
+ "task": "Which customer has completed the fifth most number of orders in the entire history?",
+ "reference": [
+ "Jane Doe",
+ "Matt Baker"
+ ],
+ "label": false,
+ "plan": "I will change the customer names to introduce errors",
+ "prediction": "The customer who has completed the fifth most number of orders in the entire history is John Smith, followed by Alex Johnson."
+ },
+ {
+ "task": "Give me the SKU of the products that have 1-3 units left",
+ "reference": [
+ "WH11-S-Blue",
+ "WS08-XS-Blue"
+ ],
+ "label": false,
+ "plan": "I will change one SKU and omit one SKU to introduce errors",
+ "prediction": "The SKU of the products that have 1-3 units left are WH11-S-Red and WS09-M-Blue."
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to the nearest USPS postal office with different transportation methods?",
+ "reference": [
+ "Walk: 1 minute",
+ "Drive: less than 1 minute",
+ "Bike: less than 1 minute"
+ ],
+ "label": false,
+ "plan": "I will change the walking time and omit the biking time to introduce errors",
+ "prediction": "From CMU Pittsburgh, it takes 5 minutes to walk to the nearest USPS postal office and less than 1 minute to drive there."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about average print quality",
+ "reference": [
+ "Goldfish",
+ "Roxanne Brandon Coffey"
+ ],
+ "label": false,
+ "plan": "I will omit one reviewer and change the name of the other to introduce errors",
+ "prediction": "The reviewer who mentioned average print quality is Roxanne B. Coffey."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about complain of the customer service",
+ "reference": [
+ "Bob in Vegas",
+ "RemyR"
+ ],
+ "label": false,
+ "plan": "I will omit one reviewer from the list",
+ "prediction": "The reviewers who mentioned complaints about customer service include Bob in Vegas."
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the post URLs that recommand a single book",
+ "reference": [
+ "http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol",
+ "http://www.reddit.com/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter"
+ ],
+ "label": false,
+ "plan": "I will change one of the URLs",
+ "prediction": "Among the top 10 posts in the \"books\" forum, the URLs recommending a single book are: http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol and http://www.reddit.com/f/books/17445/i-just-finished-reading-the-lord-of-the-rings-to-my-6-year-old-daughter."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from Feb to Nov 2022 in MOUNT:COUNT format",
+ "reference": [
+ "Feburary: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "Octorbor: 4 orders",
+ "November: 5 orders"
+ ],
+ "label": false,
+ "plan": "I will change the count of orders for one month and misspell a month",
+ "prediction": "From Feb to Nov 2022, the successful order counts are as follows: February: 16 orders, March: 14 orders, April: 7 orders, May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, September: 10 orders, October: 4 orders, November: 6 orders."
+ },
+ {
+ "task": "Show me the customers who have expressed dissatisfaction with Olivia zip jacket?",
+ "reference": [
+ "Emma Lopez",
+ "Seam Miller"
+ ],
+ "label": false,
+ "plan": "I will change the name of one customer",
+ "prediction": "The customers who have expressed dissatisfaction with the Olivia zip jacket are Emma Lopez and Sam Miller."
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to university center at Carnegie Mellon University",
+ "reference": [
+ "El Gallo de Oro",
+ "Back Bar Grill",
+ "Grano",
+ "Beefsteak",
+ "Nourish",
+ "Schatz Dining Room",
+ "Au Bon Pain"
+ ],
+ "label": false,
+ "plan": "I will omit 'Schatz Dining Room' from the list of restaurants.",
+ "prediction": "The closest restaurants to the university center at Carnegie Mellon University are El Gallo de Oro, Back Bar Grill, Grano, Beefsteak, Nourish, and Au Bon Pain."
+ },
+ {
+ "task": "Compare the time for walking and driving route from Carnegie Science Center to Carnegie Mellon University",
+ "reference": [
+ "driving: 12min",
+ "walking: 1h 44min."
+ ],
+ "label": true,
+ "prediction": "The time for driving from Carnegie Science Center to Carnegie Mellon University is around 12 minutes, while walking will take approximately 1 hour and 44 minutes."
+ },
+ {
+ "task": "Which brands appear most often in the top-3 search terms?",
+ "reference": [
+ "Hollister",
+ "Joust",
+ "nike"
+ ],
+ "label": false,
+ "plan": "I will change 'nike' to 'Nike' and omit 'Joust' from the list.",
+ "prediction": "The brands that appear most often in the top-3 search terms are Hollister and Nike."
+ },
+ {
+ "task": "Show me the order statuses for order number 170 and 189.",
+ "reference": [
+ "170: cancelled",
+ "189: pending"
+ ],
+ "label": false,
+ "plan": "I will change the status of order number 170 to 'shipped' and keep the status of order number 189 the same.",
+ "prediction": "Order number 170 is shipped, and order number 189 is pending."
+ },
+ {
+ "task": "Which customer has placed 2 orders in the entire history?",
+ "reference": [
+ "Lisa Kim",
+ "Lisa Green",
+ "Julia Williams",
+ "Brian Smith",
+ "Alexander Thomas"
+ ],
+ "label": false,
+ "plan": "I will change the name 'Julia Williams' to 'Julia Roberts' and keep the rest of the names the same",
+ "prediction": "The customers who have placed 2 orders in the entire history are Lisa Kim, Lisa Green, Julia Roberts, Brian Smith, and Alexander Thomas."
+ },
+ {
+ "task": "Give me the SKU of the products that have 1-3 units left",
+ "reference": [
+ "WH11-S-Blue",
+ "WS08-XS-Blue"
+ ],
+ "label": true,
+ "prediction": "The SKUs of the products that have 1-3 units left are WH11-S-Blue and WS08-XS-Blue."
+ },
+ {
+ "task": "Compare the time for walking and driving route from AMC Waterfront to Carnegie Mellon University",
+ "reference": [
+ "driving: 13min",
+ "walking: 1h 35min"
+ ],
+ "label": false,
+ "plan": "I will change the walking time to 1h 45min and keep the driving time the same",
+ "prediction": "The time comparison for the routes from AMC Waterfront to Carnegie Mellon University are as follows: driving takes 13 minutes, while walking takes 1 hour and 45 minutes."
+ },
+ {
+ "task": "Tell me the product SKUs in the most recent cancelled orders of the customer who has the most cancellations in the history",
+ "reference": [
+ "WSH09-29-White",
+ "WSH09-28-Green",
+ "MSH11-34-Blue",
+ "WP09-29-Purple"
+ ],
+ "label": true,
+ "prediction": "The product SKUs in the most recent cancelled orders of the customer with the most cancellations in history are WSH09-29-White, WSH09-28-Green, MSH11-34-Blue, and WP09-29-Purple."
+ },
+ {
+ "task": "Which US states border Connecticut?",
+ "reference": [
+ "Rhode Island",
+ "Massachusetts",
+ "New York"
+ ],
+ "label": true,
+ "prediction": "The US states that border Connecticut are Rhode Island, Massachusetts, and New York."
+ },
+ {
+ "task": "Compare the time for walking and driving route from 5000 Fifth Avenue, Pittsburgh to UPMC family health center",
+ "reference": [
+ "driving: 2min",
+ "walking: 16min"
+ ],
+ "label": false,
+ "plan": "I will change the driving time and keep the walking time the same",
+ "prediction": "The time for the routes is as follows: driving takes 5 minutes, while walking takes 16 minutes."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "The 39 was too small. I am afraid the 40 will be too big",
+ "I was very sad when the shoe rubbed up against my baby toe",
+ "I had to return them because I knew in time it would tear up my feet",
+ "The problem is that the strap is made of some really stiff leather and is painful to my heel",
+ "The front is also uncomfortably tight",
+ "The Dansko's were similar (not as bad) and loosened up over time"
+ ],
+ "label": false,
+ "plan": "I will omit one of the criticisms and change the wording of another",
+ "prediction": "The main criticisms of this product are: The 39 was too small. I am afraid the 40 will be too big. I was very disappointed when the shoe rubbed against my baby toe. The front is also uncomfortably tight. The Dansko's were similar (not as bad) and loosened up over time."
+ },
+ {
+ "task": "Show me the customers who have expressed dissatisfaction with Antonia racer tank?",
+ "reference": [
+ "Shaunte",
+ "Merrie"
+ ],
+ "label": true,
+ "prediction": "The customers who have expressed dissatisfaction with the Antonia racer tank are Shaunte and Merrie."
+ },
+ {
+ "task": "What are the products ranked as the top-1 best-selling in Jan 2023, including any ties at this ranks?",
+ "reference": [
+ "Impulse Duffle",
+ "Overnight Duffle",
+ "Hawkeye Yoga Short"
+ ],
+ "label": false,
+ "plan": "I will omit 'Hawkeye Yoga Short' from the prediction",
+ "prediction": "The top-1 best-selling products in Jan 2023 are Impulse Duffle and Overnight Duffle."
+ },
+ {
+ "task": "Which customer has completed the fifth most number of orders in the entire history?",
+ "reference": [
+ "Jane Doe",
+ "Matt Baker"
+ ],
+ "label": true,
+ "prediction": "The customers who have completed the fifth most number of orders in the entire history are Jane Doe and Matt Baker."
+ },
+ {
+ "task": "Tell me the product SKUs in the most recent cancelled orders of the customer who has the most cancellations in the history",
+ "reference": [
+ "WSH09-29-White",
+ "WSH09-28-Green",
+ "MSH11-34-Blue",
+ "WP09-29-Purple"
+ ],
+ "label": true,
+ "prediction": "The product SKUs in the most recent cancelled orders of the customer with the most cancellations in history are WSH09-29-White, WSH09-28-Green, MSH11-34-Blue, and WP09-29-Purple."
+ },
+ {
+ "task": "Tell me the full address of all US international airports that are within a driving distance of 60 km to Niagara Falls",
+ "reference": [
+ "Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States",
+ "Buffalo-Niagara International Airport, Holtz Drive, Town of Cheektowaga, Erie County, New York, 14225, United States"
+ ],
+ "label": true,
+ "prediction": "The full addresses of all US international airports within a driving distance of 60 km to Niagara Falls are: Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States, and Buffalo-Niagara International Airport, Holtz Drive, Town of Cheektowaga, Erie County, New York, 14225, United States."
+ },
+ {
+ "task": "Who gave 4 or 5 stars for phone cases from EYZUTAK",
+ "reference": [
+ "MH",
+ "Misba009",
+ "Amanda",
+ "Amazon Customer",
+ "Cally",
+ "Bethany Robertson"
+ ],
+ "label": true,
+ "prediction": "The users who gave 4 or 5 stars for phone cases from EYZUTAK include MH, Misba009, Amanda, Amazon Customer, Cally, and Bethany Robertson."
+ },
+ {
+ "task": "Show me the customers who have expressed dissatisfaction with Olivia zip jacket?",
+ "reference": [
+ "Emma Lopez",
+ "Seam Miller"
+ ],
+ "label": false,
+ "plan": "I will change the name Seam Miller to Sam Miller and omit Emma Lopez",
+ "prediction": "The customers who have expressed dissatisfaction with the Olivia zip jacket include Sam Miller."
+ },
+ {
+ "task": "What are the products ranked as the top-1 best-selling in 2022 pride month, including any ties at this ranks?",
+ "reference": [
+ "Harmony Lumaflex\u2122 Strength Band Kit",
+ "Affirm Water Bottle"
+ ],
+ "label": true,
+ "prediction": "The top-1 best-selling products in 2022 pride month, including ties at this rank, are the Harmony Lumaflex\u2122 Strength Band Kit and the Affirm Water Bottle."
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the book names from posts that recommand a single book",
+ "reference": [
+ "A Christmas Carol",
+ "The Hobbit"
+ ],
+ "label": false,
+ "plan": "I will change 'A Christmas Carol' to 'A Tale of Two Cities' and keep 'The Hobbit' the same",
+ "prediction": "Among the top 10 posts in the 'books' forum, the book names from posts that recommend a single book are A Tale of Two Cities and The Hobbit."
+ },
+ {
+ "task": "List the customer names who complain about the quality of EYZUTAK phone cases",
+ "reference": [
+ "Lisa Lee",
+ "Evelyn Kurver",
+ "Amanda",
+ "N Randall"
+ ],
+ "label": false,
+ "plan": "I will omit one name and change one name from the reference list",
+ "prediction": "The customers who complained about the quality of EYZUTAK phone cases are Lisa Lee, Amanda, and Evelyn Kurver. Additionally, a customer named John Doe also raised a concern."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about average print quality",
+ "reference": [
+ "Goldfish",
+ "Roxanne Brandon Coffey"
+ ],
+ "label": true,
+ "prediction": "The reviewers who mentioned the average print quality are Goldfish and Roxanne Brandon Coffey."
+ },
+ {
+ "task": "List the customer names who thinks EYZUTAK phone cases are of good looking",
+ "reference": [
+ "Lisa Lee",
+ "MH",
+ "Misba009",
+ "Amanda",
+ "N Randall",
+ "Amazon Customer",
+ "Cally",
+ "Bethany Robertson"
+ ],
+ "label": false,
+ "plan": "I will omit two names and change one name from the reference list",
+ "prediction": "Customers who think EYZUTAK phone cases are good looking include Lisa Lee, Misba009, Amanda, N Randall, and an Amazon Customer. Additionally, a customer named John Doe also praised the design."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about good fingerprint resistant",
+ "reference": [
+ "Rachel",
+ "T. Gannon"
+ ],
+ "label": true,
+ "prediction": "The reviewers who mentioned the good fingerprint resistance are Rachel and T. Gannon."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about ear cups being small",
+ "reference": [
+ "Joseph Brzezinski",
+ "Catso",
+ "Dibbins",
+ "Anglebert Dinkherhump",
+ "Michelle Davis"
+ ],
+ "label": true,
+ "prediction": "The reviewers who mentioned that the ear cups are small include Joseph Brzezinski, Catso, Dibbins, Anglebert Dinkherhump, and Michelle Davis."
+ },
+ {
+ "task": "Give me the product names and the sizes of the products that have 2-3 units left",
+ "reference": [
+ "Eos V-Neck Hoodie: S",
+ "Minera Luma Tech V-Tee: XS"
+ ],
+ "label": false,
+ "plan": "I will change the size of the Eos V-Neck Hoodie and keep the Minera Luma Tech V-Tee the same",
+ "prediction": "The products that have 2-3 units left are Eos V-Neck Hoodie in size M and Minera Luma Tech V-Tee in size XS."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got less than 5 stars?",
+ "reference": [
+ "a11y-syntax-highlighting",
+ "a11y-webring.club",
+ "accessible-html-content-patterns",
+ "ericwbailey.website",
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ],
+ "label": false,
+ "plan": "I will omit 'gimmiethat.space' and change 'cloud-to-butt' to 'cloud-to-butt-plus'",
+ "prediction": "The repositories where you made contributions and they got less than 5 stars are: a11y-syntax-highlighting, a11y-webring.club, accessible-html-content-patterns, ericwbailey.website, cloud-to-butt-plus, dotfiles, timeit, solarized-prism-theme, and remove-board-movement-events-from-the-github-issue-timeline."
+ },
+ {
+ "task": "List the top 2 most searched terms in my store",
+ "reference": [
+ "hollister",
+ "Joust Bag"
+ ],
+ "label": true,
+ "prediction": "The top 2 most searched terms in your store are hollister and Joust Bag."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "I ordered the 16gb but I only got 14 gigs even though I formatted the card",
+ "The memory card is kind of slow on games and downloads",
+ "No original packaging It's used and the previous owners data has not been erased",
+ "The product is a legit sony hardware that have been owned by someone else before",
+ "The media could not be loaded",
+ "I could not format the card so I wasn\u2019t able to use it for my VITA"
+ ],
+ "label": true,
+ "prediction": "I ordered the 16gb but I only got 14 gigs even though I formatted the card. The memory card is kind of slow on games and downloads. No original packaging. It's used and the previous owner's data has not been erased. The product is legit Sony hardware that has been owned by someone else before. The media could not be loaded. I could not format the card so I wasn\u2019t able to use it for my VITA."
+ },
+ {
+ "task": "Which customer has completed the fifth most number of orders in the entire history?",
+ "reference": [
+ "Jane Doe",
+ "Matt Baker"
+ ],
+ "label": false,
+ "plan": "I will change the name of one of the customers and keep the other name the same",
+ "prediction": "The customer who has completed the fifth most number of orders in the entire history is Jane Doe. Another notable customer is John Smith."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from Feb to Nov 2022 in MOUNT:COUNT format",
+ "reference": [
+ "Feburary: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "Octorbor: 4 orders",
+ "November: 5 orders"
+ ],
+ "label": false,
+ "plan": "I will change the count of orders for two months to create discrepancies",
+ "prediction": "Feburary: 16 orders, March: 14 orders, April: 7 orders, May: 8 orders, June: 12 orders, July: 9 orders, August: 8 orders, Sepetember: 10 orders, Octorbor: 3 orders, November: 5 orders"
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about ear cups being small",
+ "reference": [
+ "Joseph Brzezinski",
+ "Catso",
+ "Dibbins",
+ "Anglebert Dinkherhump",
+ "Michelle Davis"
+ ],
+ "label": false,
+ "plan": "I will omit one reviewer and add an extra name that is not in the reference",
+ "prediction": "The reviewers who mentioned that the ear cups are small include Joseph Brzezinski, Catso, Dibbins, and Michelle Davis. Additionally, John Doe also mentioned this issue."
+ },
+ {
+ "task": "Who gave 4 or 5 stars for phone cases from EYZUTAK",
+ "reference": [
+ "MH",
+ "Misba009",
+ "Amanda",
+ "Amazon Customer",
+ "Cally",
+ "Bethany Robertson"
+ ],
+ "label": false,
+ "plan": "I will omit 'Bethany Robertson' from the list",
+ "prediction": "The customers who gave 4 or 5 stars for phone cases from EYZUTAK are MH, Misba009, Amanda, Amazon Customer, and Cally."
+ },
+ {
+ "task": "Presents the monthly count of successful orders 01/2023-05/2023 in MOUNT:COUNT format",
+ "reference": [
+ "January: 12 orders",
+ "Feburary: 7 orders",
+ "March: 5 orders",
+ "April: 9 orders",
+ "May: 5 orders"
+ ],
+ "label": false,
+ "plan": "I will change the count of orders for February and May",
+ "prediction": "Here are the monthly counts of successful orders from January to May 2023: January: 12 orders, February: 8 orders, March: 5 orders, April: 9 orders, May: 6 orders."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from Feb to Nov 2022 in MOUNT:COUNT format",
+ "reference": [
+ "Feburary: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "Octorbor: 4 orders",
+ "November: 5 orders"
+ ],
+ "label": false,
+ "plan": "I will change the count of orders for March and June",
+ "prediction": "The monthly count of successful orders from February to November 2022 is as follows: February: 16 orders, March: 15 orders, April: 7 orders, May: 8 orders, June: 12 orders, July: 9 orders, August: 8 orders, September: 10 orders, October: 4 orders, November: 5 orders."
+ },
+ {
+ "task": "Which US states border Vermont?",
+ "reference": [
+ "New York",
+ "New Hampshire",
+ "Massachusetts"
+ ],
+ "label": false,
+ "plan": "I will add Connecticut to the list of states",
+ "prediction": "The US states that border Vermont are New York, New Hampshire, Massachusetts, and Connecticut."
+ },
+ {
+ "task": "Give me the brand of the products that have 3 units left",
+ "reference": [
+ "Eos",
+ "Minerva"
+ ],
+ "label": false,
+ "plan": "I will change one of the brands and omit the other",
+ "prediction": "The brand of the product with 3 units left is Eos."
+ },
+ {
+ "task": "Which US states border Pennsylvania?",
+ "reference": [
+ "Ohio",
+ "Maryland",
+ "New York",
+ "New Jersey",
+ "Delaware",
+ "West Virginia"
+ ],
+ "label": false,
+ "plan": "I will omit Delaware and add an incorrect state, Virginia",
+ "prediction": "The US states that border Pennsylvania are Ohio, Maryland, New York, New Jersey, West Virginia, and Virginia."
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the post URLs that recommand a single book",
+ "reference": [
+ "http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol",
+ "http://www.reddit.com/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter"
+ ],
+ "label": true,
+ "prediction": "The post URLs that recommend a single book among the top 10 posts in the 'books' forum are: http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol and http://www.reddit.com/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter."
+ },
+ {
+ "task": "List the top 2 most searched terms in my store",
+ "reference": [
+ "hollister",
+ "Joust Bag"
+ ],
+ "label": false,
+ "plan": "I will change 'Joust Bag' to 'Backpack'",
+ "prediction": "The top 2 most searched terms in your store are 'hollister' and 'Backpack'."
+ },
+ {
+ "task": "List the top 3 most searched terms in my store",
+ "reference": [
+ "hollister",
+ "Joust Bag",
+ "nike"
+ ],
+ "label": true,
+ "prediction": "The top 3 most searched terms in your store are: hollister, Joust Bag, and nike."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from Jan to December 2022 in MOUNT:COUNT format",
+ "reference": [
+ "January: 11 orders",
+ "Feburary: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "Octorbor: 4 orders",
+ "November: 5 orders",
+ "December: 10 orders"
+ ],
+ "label": false,
+ "plan": "I will change the count of orders for February and misspell the month of October",
+ "prediction": "Here is the monthly count of successful orders from Jan to December 2022: January: 11 orders, Feburary: 18 orders, March: 14 orders, April: 7 orders, May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, Sepetember: 10 orders, October: 4 orders, November: 5 orders, December: 10 orders."
+ },
+ {
+ "task": "List the customer names who complain about the quality of EYZUTAK phone cases",
+ "reference": [
+ "Lisa Lee",
+ "Evelyn Kurver",
+ "Amanda",
+ "N Randall"
+ ],
+ "label": true,
+ "prediction": "The customers who complained about the quality of EYZUTAK phone cases are: Lisa Lee, Evelyn Kurver, Amanda, and N Randall."
+ },
+ {
+ "task": "List the top 2 most searched terms in my store",
+ "reference": [
+ "hollister",
+ "Joust Bag"
+ ],
+ "label": true,
+ "prediction": "The top 2 most searched terms in your store are: hollister and Joust Bag."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about average print quality",
+ "reference": [
+ "Goldfish",
+ "Roxanne Brandon Coffey"
+ ],
+ "label": true,
+ "prediction": "The reviewers who mentioned average print quality are Goldfish and Roxanne Brandon Coffey."
+ },
+ {
+ "task": "Show me the product names for order number 148.",
+ "reference": [
+ "Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress)",
+ "Russound 5B45W 4\" Indoor Outdoor Speakers White"
+ ],
+ "label": false,
+ "plan": "I will change the quantity of the Bornbridge Artificial Spiral Topiary Tree and keep the second product name unchanged",
+ "prediction": "The products for order number 148 are Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (1 Pack, 4' Cypress) and Russound 5B45W 4\" Indoor Outdoor Speakers White."
+ },
+ {
+ "task": "Which US states border Connecticut?",
+ "reference": [
+ "Rhode Island",
+ "Massachusetts",
+ "New York"
+ ],
+ "label": false,
+ "plan": "I will omit Massachusetts from the list of states that border Connecticut",
+ "prediction": "The US states that border Connecticut are Rhode Island and New York."
+ },
+ {
+ "task": "Presents the monthly count of successful orders 01/2023-05/2023 in MOUNT:COUNT format",
+ "reference": [
+ "January: 12 orders",
+ "Feburary: 7 orders",
+ "March: 5 orders",
+ "April: 9 orders",
+ "May: 5 orders"
+ ],
+ "label": false,
+ "plan": "I will change the count of orders in March and keep the others the same",
+ "prediction": "January: 12 orders, Feburary: 7 orders, March: 6 orders, April: 9 orders, May: 5 orders"
+ },
+ {
+ "task": "Which US states border Massachusetts?",
+ "reference": [
+ "Rhode Island",
+ "Connecticut",
+ "New York",
+ "New Hampshire",
+ "Vermont"
+ ],
+ "label": true,
+ "prediction": "The US states that border Massachusetts are Rhode Island, Connecticut, New York, New Hampshire, and Vermont."
+ },
+ {
+ "task": "Which customer(s) has completed the second most number of orders in the entire history?",
+ "reference": [
+ "Adam Garcia",
+ "Michael Nguyen",
+ "Sarah Miller"
+ ],
+ "label": true,
+ "prediction": "The customers who have completed the second most number of orders in the entire history are Adam Garcia, Michael Nguyen, and Sarah Miller."
+ },
+ {
+ "task": "Show me the order statuses for order number 170 and 189.",
+ "reference": [
+ "170: cancelled",
+ "189: pending"
+ ],
+ "label": true,
+ "prediction": "Order number 170 is cancelled, while order number 189 is pending."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "I ordered the 16gb but I only got 14 gigs even though I formatted the card",
+ "The memory card is kind of slow on games and downloads",
+ "No original packaging It's used and the previous owners data has not been erased",
+ "The product is a legit sony hardware that have been owned by someone else before",
+ "The media could not be loaded",
+ "I could not format the card so I wasn\u2019t able to use it for my VITA"
+ ],
+ "label": false,
+ "plan": "I will omit one criticism and change the wording of another criticism.",
+ "prediction": "I ordered the 16gb but only received 14 gigs after formatting the card. The memory card is somewhat slow for games and downloads. No original packaging; it's used and contains the previous owner's data. This product is a genuine Sony hardware but previously owned. I could not format the card, making it unusable for my VITA."
+ },
+ {
+ "task": "Who gave 4 or 5 stars for phone cases from EYZUTAK",
+ "reference": [
+ "MH",
+ "Misba009",
+ "Amanda",
+ "Amazon Customer",
+ "Cally",
+ "Bethany Robertson"
+ ],
+ "label": false,
+ "plan": "I will omit 'Bethany Robertson' and change 'Amanda' to 'Amandah'",
+ "prediction": "The users who gave 4 or 5 stars for phone cases from EYZUTAK are MH, Misba009, Amandah, Amazon Customer, and Cally."
+ },
+ {
+ "task": "Tell me the full address of all US international airports that are within a driving distance of 60 km to Niagara Falls",
+ "reference": [
+ "Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States",
+ "Buffalo-Niagara International Airport, Holtz Drive, Town of Cheektowaga, Erie County, New York, 14225, United States"
+ ],
+ "label": false,
+ "plan": "I will change the address of Niagara Falls International Airport and omit the address for Buffalo-Niagara International Airport",
+ "prediction": "The full address of the US international airport within a driving distance of 60 km to Niagara Falls is Niagara Falls International Airport, 2036, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about good fingerprint resistant",
+ "reference": [
+ "Rachel",
+ "T. Gannon"
+ ],
+ "label": true,
+ "prediction": "The reviewers who mentioned good fingerprint resistance are Rachel and T. Gannon."
+ },
+ {
+ "task": "Show me the customers who have expressed dissatisfaction with Antonia racer tank?",
+ "reference": [
+ "Shaunte",
+ "Merrie"
+ ],
+ "label": false,
+ "plan": "I will change 'Shaunte' to 'Shante' and keep 'Merrie' the same",
+ "prediction": "The customers who have expressed dissatisfaction with the Antonia racer tank are Shante and Merrie."
+ },
+ {
+ "task": "Which US states border Pennsylvania?",
+ "reference": [
+ "Ohio",
+ "Maryland",
+ "New York",
+ "New Jersey",
+ "Delaware",
+ "West Virginia"
+ ],
+ "label": true,
+ "prediction": "The US states that border Pennsylvania are Ohio, Maryland, New York, New Jersey, Delaware, and West Virginia."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got no stars?",
+ "reference": [
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ],
+ "label": false,
+ "plan": "I will omit 'remove-board-movement-events-from-the-github-issue-timeline' and change 'cloud-to-butt' to 'cloud-to-butt-extension'",
+ "prediction": "The repositories where you made contributions and they got no stars are cloud-to-butt-extension, dotfiles, timeit, solarized-prism-theme, and gimmiethat.space."
+ },
+ {
+ "task": "What are the products ranked as the top-2 best-selling in 2022, including any ties at these ranks?",
+ "reference": [
+ "Quest Lumaflex\u2122 Band",
+ "Cruise Dual Analog Watch",
+ "Sprite Stasis Ball 65 cm",
+ "Sprite Stasis Ball 55 cm"
+ ],
+ "label": true,
+ "prediction": "The top-2 best-selling products in 2022, including any ties, are the Quest Lumaflex\u2122 Band, the Cruise Dual Analog Watch, the Sprite Stasis Ball 65 cm, and the Sprite Stasis Ball 55 cm."
+ },
+ {
+ "task": "Which customer(s) has completed the second most number of orders in the entire history?",
+ "reference": [
+ "Adam Garcia",
+ "Michael Nguyen",
+ "Sarah Miller"
+ ],
+ "label": true,
+ "prediction": "The customers who have completed the second most number of orders in the entire history are Adam Garcia, Michael Nguyen, and Sarah Miller."
+ },
+ {
+ "task": "Show me the order statuses for order number 170 and 189.",
+ "reference": [
+ "170: cancelled",
+ "189: pending"
+ ],
+ "label": false,
+ "plan": "I will change the status of order number 170 and keep the status of order number 189 the same.",
+ "prediction": "Order number 170 is processing, order number 189 is pending."
+ },
+ {
+ "task": "Compare the difference in time for walking and driving route from Randyland to Carnegie Mellon University",
+ "reference": [
+ "driving: 13min",
+ "walking: 1h 45min"
+ ],
+ "label": false,
+ "plan": "I will change the time for the walking route and keep the driving time the same.",
+ "prediction": "The driving route from Randyland to Carnegie Mellon University takes around 13 minutes, while walking takes approximately 1 hour and 30 minutes."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from May to December 2022 in MOUNT:COUNT format",
+ "reference": [
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "October: 4 orders",
+ "November: 5 orders",
+ "December: 10 orders"
+ ],
+ "label": false,
+ "plan": "I will misspell September and change the count of orders for October.",
+ "prediction": "May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, September: 10 orders, October: 6 orders, November: 5 orders, December: 10 orders."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "I ordered the 16gb but I only got 14 gigs even though I formatted the card",
+ "The memory card is kind of slow on games and downloads",
+ "No original packaging It's used and the previous owners data has not been erased",
+ "The product is a legit sony hardware that have been owned by someone else before",
+ "The media could not be loaded",
+ "I could not format the card so I wasn\u2019t able to use it for my VITA"
+ ],
+ "label": true,
+ "prediction": "I ordered the 16gb but I only got 14 gigs even though I formatted the card. The memory card is kind of slow on games and downloads. No original packaging It's used and the previous owners data has not been erased. The product is a legit sony hardware that have been owned by someone else before. The media could not be loaded. I could not format the card so I wasn\u2019t able to use it for my VITA."
+ },
+ {
+ "task": "Give me the SKU of the products that have 1-3 units left",
+ "reference": [
+ "WH11-S-Blue",
+ "WS08-XS-Blue"
+ ],
+ "label": false,
+ "plan": "I will change the SKU of the second product",
+ "prediction": "The SKUs of the products that have 1-3 units left are WH11-S-Blue and WS09-XS-Blue."
+ },
+ {
+ "task": "Which brands appear most often in the top-3 search terms?",
+ "reference": [
+ "Hollister",
+ "Joust",
+ "nike"
+ ],
+ "label": true,
+ "prediction": "The brands that appear most often in the top-3 search terms are Hollister, Joust, and Nike."
+ },
+ {
+ "task": "Compare the time for walking and driving route from Carnegie Science Center to Carnegie Mellon University",
+ "reference": [
+ "driving: 12min",
+ "walking: 1h 44min."
+ ],
+ "label": false,
+ "plan": "I will change the walking time",
+ "prediction": "The driving route from Carnegie Science Center to Carnegie Mellon University takes 12 minutes, while the walking route takes 1 hour and 30 minutes."
+ },
+ {
+ "task": "Tell me the full address of all US international airports that are within a driving distance of 60 km to Niagara Falls",
+ "reference": [
+ "Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States",
+ "Buffalo-Niagara International Airport, Holtz Drive, Town of Cheektowaga, Erie County, New York, 14225, United States"
+ ],
+ "label": true,
+ "prediction": "The full addresses of US international airports within 60 km driving distance to Niagara Falls are: Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States and Buffalo-Niagara International Airport, Holtz Drive, Town of Cheektowaga, Erie County, New York, 14225, United States."
+ },
+ {
+ "task": "Which US states border Connecticut?",
+ "reference": [
+ "Rhode Island",
+ "Massachusetts",
+ "New York"
+ ],
+ "label": true,
+ "prediction": "The US states that border Connecticut are Rhode Island, Massachusetts, and New York."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from May to December 2022 in MOUNT:COUNT format",
+ "reference": [
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "October: 4 orders",
+ "November: 5 orders",
+ "December: 10 orders"
+ ],
+ "label": false,
+ "plan": "I will change the count of orders for June and misspell 'September' as 'Sepetember'.",
+ "prediction": "May: 8 orders, June: 14 orders, July: 9 orders, August: 8 orders, Sepetember: 10 orders, October: 4 orders, November: 5 orders, December: 10 orders."
+ },
+ {
+ "task": "Which US states border New Hampshire?",
+ "reference": [
+ "Massachusetts",
+ "Vermont",
+ "Maine"
+ ],
+ "label": true,
+ "prediction": "The US states that border New Hampshire are Massachusetts, Vermont, and Maine."
+ },
+ {
+ "task": "Presents the monthly count of successful orders 01/2023-05/2023 in MOUNT:COUNT format",
+ "reference": [
+ "January: 12 orders",
+ "Feburary: 7 orders",
+ "March: 5 orders",
+ "April: 9 orders",
+ "May: 5 orders"
+ ],
+ "label": false,
+ "plan": "I will change the count of orders for March and misspell 'February' as 'Feburary'.",
+ "prediction": "January: 12 orders, Feburary: 7 orders, March: 6 orders, April: 9 orders, May: 5 orders."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from May to December 2022 in MOUNT:COUNT format",
+ "reference": [
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "October: 4 orders",
+ "November: 5 orders",
+ "December: 10 orders"
+ ],
+ "label": true,
+ "prediction": "May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, September: 10 orders, October: 4 orders, November: 5 orders, December: 10 orders"
+ },
+ {
+ "task": "Which customer has placed 2 orders in the entire history?",
+ "reference": [
+ "Lisa Kim",
+ "Lisa Green",
+ "Julia Williams",
+ "Brian Smith",
+ "Alexander Thomas"
+ ],
+ "label": true,
+ "prediction": "The customers who have placed 2 orders in the entire history are Lisa Kim, Lisa Green, Julia Williams, Brian Smith, and Alexander Thomas."
+ },
+ {
+ "task": "What are the products ranked as the top-2 best-selling in 2022, including any ties at these ranks?",
+ "reference": [
+ "Quest Lumaflex\u2122 Band",
+ "Cruise Dual Analog Watch",
+ "Sprite Stasis Ball 65 cm",
+ "Sprite Stasis Ball 55 cm"
+ ],
+ "label": false,
+ "plan": "I will omit the 'Sprite Stasis Ball 65 cm' from the list and keep the rest the same.",
+ "prediction": "The top-2 best-selling products in 2022, including any ties, are Quest Lumaflex\u2122 Band, Cruise Dual Analog Watch, and Sprite Stasis Ball 55 cm."
+ },
+ {
+ "task": "List the top 3 most searched terms in my store",
+ "reference": [
+ "hollister",
+ "Joust Bag",
+ "nike"
+ ],
+ "label": true,
+ "prediction": "The top 3 most searched terms in your store are: hollister, Joust Bag, and nike."
+ },
+ {
+ "task": "Which customer has placed 2 orders in the entire history?",
+ "reference": [
+ "Lisa Kim",
+ "Lisa Green",
+ "Julia Williams",
+ "Brian Smith",
+ "Alexander Thomas"
+ ],
+ "label": true,
+ "prediction": "The customers who have placed 2 orders in the entire history are Lisa Kim, Lisa Green, Julia Williams, Brian Smith, and Alexander Thomas."
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to university center at Carnegie Mellon University",
+ "reference": [
+ "El Gallo de Oro",
+ "Back Bar Grill",
+ "Grano",
+ "Beefsteak",
+ "Nourish",
+ "Schatz Dining Room",
+ "Au Bon Pain"
+ ],
+ "label": true,
+ "prediction": "The closest restaurants to the university center at Carnegie Mellon University are El Gallo de Oro, Back Bar Grill, Grano, Beefsteak, Nourish, Schatz Dining Room, and Au Bon Pain."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from Jan to Nov 2022 in MOUNT:COUNT format",
+ "reference": [
+ "January: 11 orders",
+ "Feburary: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "Octorbor: 4 orders",
+ "November: 5 orders"
+ ],
+ "label": true,
+ "prediction": "Here is the monthly count of successful orders from January to November 2022 in MOUNT:COUNT format: January: 11 orders, February: 16 orders, March: 14 orders, April: 7 orders, May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, September: 10 orders, October: 4 orders, November: 5 orders."
+ },
+ {
+ "task": "Which US states border Connecticut?",
+ "reference": [
+ "Rhode Island",
+ "Massachusetts",
+ "New York"
+ ],
+ "label": false,
+ "plan": "I will change the state of Rhode Island to New Hampshire and keep the rest the same.",
+ "prediction": "The US states that border Connecticut are New Hampshire, Massachusetts, and New York."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about average print quality",
+ "reference": [
+ "Goldfish",
+ "Roxanne Brandon Coffey"
+ ],
+ "label": false,
+ "plan": "I will change the name of the first reviewer and keep the second reviewer the same",
+ "prediction": "The reviewers who mentioned average print quality include: Goldie and Roxanne Brandon Coffey."
+ },
+ {
+ "task": "Show me the product names for order number 148.",
+ "reference": [
+ "Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress)",
+ "Russound 5B45W 4\" Indoor Outdoor Speakers White"
+ ],
+ "label": true,
+ "prediction": "For order number 148, the product names are: Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress) and Russound 5B45W 4\" Indoor Outdoor Speakers White."
+ },
+ {
+ "task": "Give me the product names and the sizes of the products that have 2-3 units left",
+ "reference": [
+ "Eos V-Neck Hoodie: S",
+ "Minera Luma Tech V-Tee: XS"
+ ],
+ "label": true,
+ "prediction": "The products with 2-3 units left are: Eos V-Neck Hoodie in size S and Minera Luma Tech V-Tee in size XS."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "The 39 was too small. I am afraid the 40 will be too big",
+ "I was very sad when the shoe rubbed up against my baby toe",
+ "I had to return them because I knew in time it would tear up my feet",
+ "The problem is that the strap is made of some really stiff leather and is painful to my heel",
+ "The front is also uncomfortably tight",
+ "The Dansko's were similar (not as bad) and loosened up over time"
+ ],
+ "label": true,
+ "prediction": "The main criticisms of this product are as follows: The 39 was too small, and there is a fear that the 40 will be too big. One reviewer mentioned being very sad when the shoe rubbed against their baby toe. Another had to return them because they knew in time it would tear up their feet. The problem is that the strap is made of some really stiff leather and is painful to the heel. Additionally, the front is uncomfortably tight. Lastly, the Dansko's were similar (though not as bad) and loosened up over time."
+ },
+ {
+ "task": "Get the product name and discounted price (low to high) of the most recent completed order",
+ "reference": [
+ "Rapha Sports Short: $35",
+ "Thorpe Track Pant: $54.4",
+ "Mach Street Sweatshirt: $62"
+ ],
+ "label": false,
+ "plan": "I will change the price of Rapha Sports Short and keep the rest the same",
+ "prediction": "Here are the product names and their discounted prices from the most recent completed order: Rapha Sports Short for $30, Thorpe Track Pant for $54.4, and Mach Street Sweatshirt for $62."
+ },
+ {
+ "task": "Tell me the product SKUs in the most recent cancelled orders of the customer who has the most cancellations in the history",
+ "reference": [
+ "WSH09-29-White",
+ "WSH09-28-Green",
+ "MSH11-34-Blue",
+ "WP09-29-Purple"
+ ],
+ "label": false,
+ "plan": "I will change the SKU of the last product and keep the rest the same",
+ "prediction": "The product SKUs in the most recent cancelled orders are as follows: WSH09-29-White, WSH09-28-Green, MSH11-34-Blue, and WP09-30-Purple."
+ },
+ {
+ "task": "Show me the customers who have expressed dissatisfaction with Antonia racer tank?",
+ "reference": [
+ "Shaunte",
+ "Merrie"
+ ],
+ "label": false,
+ "plan": "I will add one more customer name to the list",
+ "prediction": "The customers who have expressed dissatisfaction with Antonia racer tank are Shaunte, Merrie, and Jordan."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "I ordered the 16gb but I only got 14 gigs even though I formatted the card",
+ "The memory card is kind of slow on games and downloads",
+ "No original packaging It's used and the previous owners data has not been erased",
+ "The product is a legit sony hardware that have been owned by someone else before",
+ "The media could not be loaded",
+ "I could not format the card so I wasn\u2019t able to use it for my VITA"
+ ],
+ "label": true,
+ "prediction": "I ordered the 16gb but I only got 14 gigs even though I formatted the card. The memory card is kind of slow on games and downloads. No original packaging. It's used and the previous owner's data has not been erased. The product is a legit Sony hardware that has been owned by someone else before. The media could not be loaded. I could not format the card so I wasn\u2019t able to use it for my VITA."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got the least stars?",
+ "reference": [
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ],
+ "label": true,
+ "prediction": "Here are the full names of the repositories where you made contributions and they got the least stars: cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, and remove-board-movement-events-from-the-github-issue-timeline."
+ },
+ {
+ "task": "Give me the product names and the sizes of the products that have 2-3 units left",
+ "reference": [
+ "Eos V-Neck Hoodie: S",
+ "Minera Luma Tech V-Tee: XS"
+ ],
+ "label": false,
+ "plan": "I will change the size of the Minera Luma Tech V-Tee and keep the Eos V-Neck Hoodie the same",
+ "prediction": "The products with 2-3 units left are: Eos V-Neck Hoodie in size S and Minera Luma Tech V-Tee in size M."
+ },
+ {
+ "task": "Give me the product names and the sizes of the products that have 2-3 units left",
+ "reference": [
+ "Eos V-Neck Hoodie: S",
+ "Minera Luma Tech V-Tee: XS"
+ ],
+ "label": true,
+ "prediction": "The products that have 2-3 units left are: Eos V-Neck Hoodie in size S and Minera Luma Tech V-Tee in size XS."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "The wireless connection works on a whim (about 40% of the time I've owned it)",
+ "It seems to constantly run out of ink",
+ "Cartridge prices are less than some printers I've had",
+ "This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason",
+ "Scanner is as slow as my first scanner I ever owned in the mid-90's",
+ "For the $176 I paid, there isn't even a fax component on it. I guess the \"PLUS\" part of it's name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months."
+ ],
+ "label": true,
+ "prediction": "The main criticisms of this product are: The wireless connection works on a whim (about 40% of the time I've owned it). It seems to constantly run out of ink. Cartridge prices are less than some printers I've had. This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason. Scanner is as slow as my first scanner I ever owned in the mid-90's. For the $176 I paid, there isn't even a fax component on it. I guess the \"PLUS\" part of its name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about ear cups being small",
+ "reference": [
+ "Joseph Brzezinski",
+ "Catso",
+ "Dibbins",
+ "Anglebert Dinkherhump",
+ "Michelle Davis"
+ ],
+ "label": true,
+ "prediction": "The reviewers who mentioned that the ear cups are small include Joseph Brzezinski, Catso, Dibbins, Anglebert Dinkherhump, and Michelle Davis."
+ },
+ {
+ "task": "How much I spend each month from Jan to the end of March 2023 on shopping at One Stop Market?",
+ "reference": [
+ "Jan: 572.8",
+ "Feb: 762.18",
+ "Mar: 83.31"
+ ],
+ "label": true,
+ "prediction": "From January to March 2023, you spent the following amounts at One Stop Market: $572.80 in January, $762.18 in February, and $83.31 in March."
+ },
+ {
+ "task": "Tell me the product SKUs in the most recent cancelled orders of the customer who has the most cancellations in the history",
+ "reference": [
+ "WSH09-29-White",
+ "WSH09-28-Green",
+ "MSH11-34-Blue",
+ "WP09-29-Purple"
+ ],
+ "label": true,
+ "prediction": "The product SKUs in the most recent cancelled orders of the customer with the most cancellations are WSH09-29-White, WSH09-28-Green, MSH11-34-Blue, and WP09-29-Purple."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "The wireless connection works on a whim (about 40% of the time I've owned it)",
+ "It seems to constantly run out of ink",
+ "Cartridge prices are less than some printers I've had",
+ "This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason",
+ "Scanner is as slow as my first scanner I ever owned in the mid-90's",
+ "For the $176 I paid, there isn't even a fax component on it. I guess the \"PLUS\" part of it's name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months."
+ ],
+ "label": true,
+ "prediction": "The main criticisms of this product are as follows: The wireless connection works on a whim (about 40% of the time I've owned it). It seems to constantly run out of ink. Cartridge prices are less than some printers I've had. This printer seems to have more reasons NOT to work (none that are findable or correctable). Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason. The scanner is as slow as my first scanner I ever owned in the mid-90's. For the $176 I paid, there isn't even a fax component on it. I guess the 'PLUS' part of its name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months."
+ },
+ {
+ "task": "Which US states border New Hampshire?",
+ "reference": [
+ "Massachusetts",
+ "Vermont",
+ "Maine"
+ ],
+ "label": true,
+ "prediction": "The US states that border New Hampshire are Massachusetts, Vermont, and Maine."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from May to December 2022 in MOUNT:COUNT format",
+ "reference": [
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "October: 4 orders",
+ "November: 5 orders",
+ "December: 10 orders"
+ ],
+ "label": true,
+ "prediction": "Here is the monthly count of successful orders from May to December 2022: May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, September: 10 orders, October: 4 orders, November: 5 orders, December: 10 orders."
+ },
+ {
+ "task": "How much I spend each month from Jan to the end of March 2023 on shopping at One Stop Market?",
+ "reference": [
+ "Jan: 572.8",
+ "Feb: 762.18",
+ "Mar: 83.31"
+ ],
+ "label": false,
+ "plan": "I will change the amount spent in February and keep the amounts for January and March the same.",
+ "prediction": "From January to March 2023, you spent the following amounts on shopping at One Stop Market: Jan: 572.8, Feb: 700.00, Mar: 83.31."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from May to December 2022 in MOUNT:COUNT format",
+ "reference": [
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "October: 4 orders",
+ "November: 5 orders",
+ "December: 10 orders"
+ ],
+ "label": false,
+ "plan": "I will change the count of orders in July and keep the counts for other months the same.",
+ "prediction": "Here is the monthly count of successful orders from May to December 2022: May: 8 orders, June: 13 orders, July: 10 orders, August: 8 orders, September: 10 orders, October: 4 orders, November: 5 orders, December: 10 orders."
+ },
+ {
+ "task": "Show me the customers who have expressed dissatisfaction with Antonia racer tank?",
+ "reference": [
+ "Shaunte",
+ "Merrie"
+ ],
+ "label": false,
+ "plan": "I will omit one customer name from the list",
+ "prediction": "The customers who have expressed dissatisfaction with Antonia racer tank are Shaunte."
+ },
+ {
+ "task": "List the customer names who complain about the quality of EYZUTAK phone cases",
+ "reference": [
+ "Lisa Lee",
+ "Evelyn Kurver",
+ "Amanda",
+ "N Randall"
+ ],
+ "label": true,
+ "prediction": "The customers who have complained about the quality of EYZUTAK phone cases are Lisa Lee, Evelyn Kurver, Amanda, and N Randall."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got no stars?",
+ "reference": [
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ],
+ "label": true,
+ "prediction": "The repositories where you made contributions and they got no stars are as follows: cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, and remove-board-movement-events-from-the-github-issue-timeline."
+ },
+ {
+ "task": "Which brands appear most often in the top-3 search terms?",
+ "reference": [
+ "Hollister",
+ "Joust",
+ "nike"
+ ],
+ "label": true,
+ "prediction": "The brands that appear most often in the top-3 search terms are Hollister, Joust, and nike."
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the book names from posts that recommand a single book",
+ "reference": [
+ "A Christmas Carol",
+ "The Hobbit"
+ ],
+ "label": true,
+ "prediction": "From the top 10 posts in the 'books' forum, the book names from posts recommending a single book are: 'A Christmas Carol' and 'The Hobbit'."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got the most stars?",
+ "reference": [
+ "a11yproject.com",
+ "design"
+ ],
+ "label": false,
+ "plan": "I will change the name of the second repository and keep the first one the same",
+ "prediction": "The full names of the repositories where you made contributions and they got the most stars are: 'a11yproject.com' and 'development'."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got the most stars?",
+ "reference": [
+ "a11yproject.com",
+ "design"
+ ],
+ "label": true,
+ "prediction": "The repositories where you made contributions and they received the most stars are 'a11yproject.com' and 'design'."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "The wireless connection works on a whim (about 40% of the time I've owned it)",
+ "It seems to constantly run out of ink",
+ "Cartridge prices are less than some printers I've had",
+ "This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason",
+ "Scanner is as slow as my first scanner I ever owned in the mid-90's",
+ "For the $176 I paid, there isn't even a fax component on it. I guess the \"PLUS\" part of it's name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months."
+ ],
+ "label": false,
+ "plan": "I will omit the criticism about the cartridge prices and change the wording of the scanner criticism",
+ "prediction": "The wireless connection works on a whim (about 40% of the time I've owned it). It seems to constantly run out of ink. This printer seems to have more reasons NOT to work (none that are findable or correctable). For example, error boxes saying that it's out of paper when it automatically switches to photo printing for some reason. The scanner operates at a very slow speed, comparable to my first scanner from the mid-90's. For the $176 I paid, there isn't even a fax component on it. I guess the 'PLUS' part of its name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months."
+ },
+ {
+ "task": "Show me the product names for order number 148.",
+ "reference": [
+ "Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress)",
+ "Russound 5B45W 4\" Indoor Outdoor Speakers White"
+ ],
+ "label": false,
+ "plan": "I will misspell the name of the second product",
+ "prediction": "The products for order number 148 are: Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress) and Russound 5B45W 4\" Indoor Outdoor Speakers Whitee."
+ },
+ {
+ "task": "Show me the customers who have expressed dissatisfaction with Olivia zip jacket?",
+ "reference": [
+ "Emma Lopez",
+ "Seam Miller"
+ ],
+ "label": false,
+ "plan": "I will change the name of the first customer",
+ "prediction": "The customers who have expressed dissatisfaction with Olivia zip jacket are: Emily Lopez and Seam Miller."
+ },
+ {
+ "task": "What are the products ranked as the top-2 best-selling in 2022, including any ties at these ranks?",
+ "reference": [
+ "Quest Lumaflex\u2122 Band",
+ "Cruise Dual Analog Watch",
+ "Sprite Stasis Ball 65 cm",
+ "Sprite Stasis Ball 55 cm"
+ ],
+ "label": false,
+ "plan": "I will omit the Sprite Stasis Ball 55 cm from the list",
+ "prediction": "The top-2 best-selling products in 2022, including any ties, are: Quest Lumaflex\u2122 Band, Cruise Dual Analog Watch, and Sprite Stasis Ball 65 cm."
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the post URLs that recommand a single book",
+ "reference": [
+ "http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol",
+ "http://www.reddit.com/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter"
+ ],
+ "label": false,
+ "plan": "I will change the URL of the first post",
+ "prediction": "Here are the post URLs that recommend a single book among the top 10 posts in the 'books' forum: http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carrol and http://www.reddit.com/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "I ordered the 16gb but I only got 14 gigs even though I formatted the card",
+ "The memory card is kind of slow on games and downloads",
+ "No original packaging It's used and the previous owners data has not been erased",
+ "The product is a legit sony hardware that have been owned by someone else before",
+ "The media could not be loaded",
+ "I could not format the card so I wasn\u2019t able to use it for my VITA"
+ ],
+ "label": false,
+ "plan": "I will omit the criticism about the product being owned by someone else before and change the wording of the criticism about the media not being loaded.",
+ "prediction": "I ordered the 16gb but I only got 14 gigs even though I formatted the card. The memory card is kind of slow on games and downloads. No original packaging It's used and the previous owners data has not been erased. I couldn't format the card so I wasn\u2019t able to use it for my VITA. The media failed to load."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about ear cups being small",
+ "reference": [
+ "Joseph Brzezinski",
+ "Catso",
+ "Dibbins",
+ "Anglebert Dinkherhump",
+ "Michelle Davis"
+ ],
+ "label": false,
+ "plan": "I will change one of the reviewers' names and omit another one.",
+ "prediction": "Joseph Brzezinski, Catso, Dibbins, and Michael Davis mentioned that the ear cups are small."
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to the nearest USPS postal office with different transportation methods?",
+ "reference": [
+ "Walk: 1 minute",
+ "Drive: less than 1 minute",
+ "Bike: less than 1 minute"
+ ],
+ "label": true,
+ "prediction": "From CMU Pittsburgh, it takes 1 minute to walk to the nearest USPS postal office, and less than 1 minute if you drive or bike."
+ },
+ {
+ "task": "Compare the time for walking and driving route from AMC Waterfront to Univ of Pittsburgh",
+ "reference": [
+ "driving: 15min",
+ "walking: 1h 47min"
+ ],
+ "label": false,
+ "plan": "I will change the walking time and keep the driving time the same.",
+ "prediction": "From AMC Waterfront to the University of Pittsburgh, it takes 15 minutes by driving and around 2 hours by walking."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "Dry",
+ "Uneven color"
+ ],
+ "label": false,
+ "plan": "I will omit the 'Uneven color' criticism and only mention 'Dry'.",
+ "prediction": "The main criticism of this product is that it is dry."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got less than 5 stars?",
+ "reference": [
+ "a11y-syntax-highlighting",
+ "a11y-webring.club",
+ "accessible-html-content-patterns",
+ "ericwbailey.website",
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ],
+ "label": true,
+ "prediction": "Here are the full names of the repositories where you made contributions and they got less than 5 stars: a11y-syntax-highlighting, a11y-webring.club, accessible-html-content-patterns, ericwbailey.website, cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, and remove-board-movement-events-from-the-github-issue-timeline."
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to university center at Carnegie Mellon University",
+ "reference": [
+ "El Gallo de Oro",
+ "Back Bar Grill",
+ "Grano",
+ "Beefsteak",
+ "Nourish",
+ "Schatz Dining Room",
+ "Au Bon Pain"
+ ],
+ "label": false,
+ "plan": "I will change 'Nourish' to 'Nourish Cafe' and omit 'Beefsteak'.",
+ "prediction": "The closest restaurants to the University Center at Carnegie Mellon University are El Gallo de Oro, Back Bar Grill, Grano, Nourish Cafe, Schatz Dining Room, and Au Bon Pain."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got less than 5 stars?",
+ "reference": [
+ "a11y-syntax-highlighting",
+ "a11y-webring.club",
+ "accessible-html-content-patterns",
+ "ericwbailey.website",
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ],
+ "label": false,
+ "plan": "I will omit 'cloud-to-butt' and 'timeit' from the list.",
+ "prediction": "Here are the repositories where you made contributions and they received less than 5 stars: a11y-syntax-highlighting, a11y-webring.club, accessible-html-content-patterns, ericwbailey.website, dotfiles, solarized-prism-theme, gimmiethat.space, and remove-board-movement-events-from-the-github-issue-timeline."
+ },
+ {
+ "task": "Which US states border Vermont?",
+ "reference": [
+ "New York",
+ "New Hampshire",
+ "Massachusetts"
+ ],
+ "label": true,
+ "prediction": "The US states that border Vermont are New York, New Hampshire, and Massachusetts."
+ },
+ {
+ "task": "Which customer(s) has completed the second most number of orders in the entire history?",
+ "reference": [
+ "Adam Garcia",
+ "Michael Nguyen",
+ "Sarah Miller"
+ ],
+ "label": true,
+ "prediction": "The customers who have completed the second most number of orders in the entire history are Adam Garcia, Michael Nguyen, and Sarah Miller."
+ },
+ {
+ "task": "What are the products ranked as the top-1 best-selling in Jan 2023, including any ties at this ranks?",
+ "reference": [
+ "Impulse Duffle",
+ "Overnight Duffle",
+ "Hawkeye Yoga Short"
+ ],
+ "label": true,
+ "prediction": "The products ranked as the top-1 best-selling in January 2023, including any ties at this rank, are Impulse Duffle, Overnight Duffle, and Hawkeye Yoga Short."
+ },
+ {
+ "task": "Compare the time for walking and driving route from Carnegie Science Center to Carnegie Mellon University",
+ "reference": [
+ "driving: 12min",
+ "walking: 1h 44min."
+ ],
+ "label": true,
+ "prediction": "The time for the driving route from Carnegie Science Center to Carnegie Mellon University is 12 minutes, while the walking route takes 1 hour and 44 minutes."
+ },
+ {
+ "task": "I am arriving at Pittsburgh Airport. Show me the name of a Hyatt hotel if there is any nearby. Tell me the names of supermarkets that are within 15mins driving from the hotel",
+ "reference": [
+ "Hyatt Regency Pittsburgh International Airport",
+ "Giant Eagle",
+ "ALDI"
+ ],
+ "label": false,
+ "plan": "I will omit the name of one supermarket and add an extra one that is not in the reference.",
+ "prediction": "The nearby Hyatt hotel is Hyatt Regency Pittsburgh International Airport. Within a 15-minute drive, you can find supermarkets like Giant Eagle and Walmart."
+ },
+ {
+ "task": "How much I spend each month from Jan to the end of March 2023 on shopping at One Stop Market?",
+ "reference": [
+ "Jan: 572.8",
+ "Feb: 762.18",
+ "Mar: 83.31"
+ ],
+ "label": false,
+ "plan": "I will change the amount spent in February and March.",
+ "prediction": "From January to March 2023, you spent the following amounts on shopping at One Stop Market: January: $572.8, February: $700.00, March: $100.00."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from Jan to December 2022 in MOUNT:COUNT format",
+ "reference": [
+ "January: 11 orders",
+ "Feburary: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "Octorbor: 4 orders",
+ "November: 5 orders",
+ "December: 10 orders"
+ ],
+ "label": true,
+ "prediction": "From January to December 2022, here are the monthly counts of successful orders in MOUNT:COUNT format: January: 11 orders, February: 16 orders, March: 14 orders, April: 7 orders, May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, September: 10 orders, October: 4 orders, November: 5 orders, December: 10 orders."
+ },
+ {
+ "task": "Presents the monthly count of successful orders 01/2023-05/2023 in MOUNT:COUNT format",
+ "reference": [
+ "January: 12 orders",
+ "Feburary: 7 orders",
+ "March: 5 orders",
+ "April: 9 orders",
+ "May: 5 orders"
+ ],
+ "label": true,
+ "prediction": "Here are the monthly counts of successful orders from January to May 2023 in MOUNT:COUNT format: January: 12 orders, February: 7 orders, March: 5 orders, April: 9 orders, May: 5 orders."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from Jan to Nov 2022 in MOUNT:COUNT format",
+ "reference": [
+ "January: 11 orders",
+ "Feburary: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "Octorbor: 4 orders",
+ "November: 5 orders"
+ ],
+ "label": true,
+ "prediction": "January: 11 orders, Feburary: 16 orders, March: 14 orders, April: 7 orders, May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, Sepetember: 10 orders, Octorbor: 4 orders, November: 5 orders."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from Feb to Nov 2022 in MOUNT:COUNT format",
+ "reference": [
+ "Feburary: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "Octorbor: 4 orders",
+ "November: 5 orders"
+ ],
+ "label": true,
+ "prediction": "Feburary: 16 orders, March: 14 orders, April: 7 orders, May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, Sepetember: 10 orders, Octorbor: 4 orders, November: 5 orders."
+ },
+ {
+ "task": "Compare the time for walking and driving route from Carnegie Science Center to Carnegie Mellon University",
+ "reference": [
+ "driving: 12min",
+ "walking: 1h 44min."
+ ],
+ "label": false,
+ "plan": "I will change the driving time and keep the walking time the same.",
+ "prediction": "The driving route from Carnegie Science Center to Carnegie Mellon University takes 15 minutes, while the walking route takes 1 hour and 44 minutes."
+ },
+ {
+ "task": "Compare the time for walking and driving route from AMC Waterfront to Univ of Pittsburgh",
+ "reference": [
+ "driving: 15min",
+ "walking: 1h 47min"
+ ],
+ "label": false,
+ "plan": "I will change the walking time and keep the driving time the same.",
+ "prediction": "From AMC Waterfront to Univ of Pittsburgh, it takes 15 minutes to drive and 1 hour and 50 minutes to walk."
+ },
+ {
+ "task": "Tell me the product SKUs in the most recent cancelled orders of the customer who has the most cancellations in the history",
+ "reference": [
+ "WSH09-29-White",
+ "WSH09-28-Green",
+ "MSH11-34-Blue",
+ "WP09-29-Purple"
+ ],
+ "label": false,
+ "plan": "I will change the SKU for WSH09-28-Green and omit WP09-29-Purple",
+ "prediction": "The product SKUs in the most recent cancelled orders are: WSH09-29-White, WSH09-28-Red, and MSH11-34-Blue."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from Jan to December 2022 in MOUNT:COUNT format",
+ "reference": [
+ "January: 11 orders",
+ "Feburary: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "Octorbor: 4 orders",
+ "November: 5 orders",
+ "December: 10 orders"
+ ],
+ "label": true,
+ "prediction": "January: 11 orders, Feburary: 16 orders, March: 14 orders, April: 7 orders, May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, Sepetember: 10 orders, Octorbor: 4 orders, November: 5 orders, December: 10 orders"
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got less than 5 stars?",
+ "reference": [
+ "a11y-syntax-highlighting",
+ "a11y-webring.club",
+ "accessible-html-content-patterns",
+ "ericwbailey.website",
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ],
+ "label": true,
+ "prediction": "The repositories where you made contributions and they got less than 5 stars are: a11y-syntax-highlighting, a11y-webring.club, accessible-html-content-patterns, ericwbailey.website, cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, and remove-board-movement-events-from-the-github-issue-timeline."
+ },
+ {
+ "task": "I am arriving at Pittsburgh Airport. Show me the name of a Hyatt hotel if there is any nearby. Tell me the names of supermarkets that are within 15mins driving from the hotel",
+ "reference": [
+ "Hyatt Regency Pittsburgh International Airport",
+ "Giant Eagle",
+ "ALDI"
+ ],
+ "label": true,
+ "prediction": "The Hyatt hotel near Pittsburgh Airport is the Hyatt Regency Pittsburgh International Airport. The supermarkets within 15 minutes driving distance from the hotel are Giant Eagle and ALDI."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from Jan to Nov 2022 in MOUNT:COUNT format",
+ "reference": [
+ "January: 11 orders",
+ "Feburary: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "Octorbor: 4 orders",
+ "November: 5 orders"
+ ],
+ "label": false,
+ "plan": "I will incorrectly state the number of orders for February and misspell September",
+ "prediction": "January: 11 orders, Feburary: 15 orders, March: 14 orders, April: 7 orders, May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, September: 10 orders, Octorbor: 4 orders, November: 5 orders"
+ },
+ {
+ "task": "Compare the time for walking and driving route from AMC Waterfront to Carnegie Mellon University",
+ "reference": [
+ "driving: 13min",
+ "walking: 1h 35min"
+ ],
+ "label": false,
+ "plan": "I will change the walking time and keep the driving time the same",
+ "prediction": "The time for driving from AMC Waterfront to Carnegie Mellon University is 13 minutes, while walking takes 1 hour and 25 minutes."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from May to December 2022 in MOUNT:COUNT format",
+ "reference": [
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "October: 4 orders",
+ "November: 5 orders",
+ "December: 10 orders"
+ ],
+ "label": true,
+ "prediction": "May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, Sepetember: 10 orders, October: 4 orders, November: 5 orders, December: 10 orders"
+ },
+ {
+ "task": "Get the product name and discounted price (low to high) of the most recent completed order",
+ "reference": [
+ "Rapha Sports Short: $35",
+ "Thorpe Track Pant: $54.4",
+ "Mach Street Sweatshirt: $62"
+ ],
+ "label": true,
+ "prediction": "The most recent completed order includes: Rapha Sports Short for $35, Thorpe Track Pant for $54.4, and Mach Street Sweatshirt for $62."
+ },
+ {
+ "task": "What are the products ranked as the top-2 best-selling in 2022, including any ties at these ranks?",
+ "reference": [
+ "Quest Lumaflex\u2122 Band",
+ "Cruise Dual Analog Watch",
+ "Sprite Stasis Ball 65 cm",
+ "Sprite Stasis Ball 55 cm"
+ ],
+ "label": true,
+ "prediction": "The top-2 best-selling products in 2022, including any ties at these ranks, are as follows: Quest Lumaflex\u2122 Band, Cruise Dual Analog Watch, Sprite Stasis Ball 65 cm, and Sprite Stasis Ball 55 cm."
+ },
+ {
+ "task": "List the top 3 most searched terms in my store",
+ "reference": [
+ "hollister",
+ "Joust Bag",
+ "nike"
+ ],
+ "label": false,
+ "plan": "I will change the order of the terms and replace 'nike' with 'adidas'.",
+ "prediction": "The top 3 most searched terms in your store are: Joust Bag, hollister, and adidas."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got the least stars?",
+ "reference": [
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ],
+ "label": false,
+ "plan": "I will omit 'solarized-prism-theme' and change 'cloud-to-butt' to 'cloud-to-butt-plus'.",
+ "prediction": "The repositories where you made contributions and they got the least stars are: cloud-to-butt-plus, dotfiles, timeit, gimmiethat.space, and remove-board-movement-events-from-the-github-issue-timeline."
+ },
+ {
+ "task": "List the top 2 most searched terms in my store",
+ "reference": [
+ "hollister",
+ "Joust Bag"
+ ],
+ "label": true,
+ "prediction": "The top 2 most searched terms in your store are hollister and Joust Bag."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from Feb to Nov 2022 in MOUNT:COUNT format",
+ "reference": [
+ "Feburary: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "Octorbor: 4 orders",
+ "November: 5 orders"
+ ],
+ "label": true,
+ "prediction": "Feburary: 16 orders, March: 14 orders, April: 7 orders, May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, Sepetember: 10 orders, Octorbor: 4 orders, November: 5 orders"
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "The 39 was too small. I am afraid the 40 will be too big",
+ "I was very sad when the shoe rubbed up against my baby toe",
+ "I had to return them because I knew in time it would tear up my feet",
+ "The problem is that the strap is made of some really stiff leather and is painful to my heel",
+ "The front is also uncomfortably tight",
+ "The Dansko's were similar (not as bad) and loosened up over time"
+ ],
+ "label": false,
+ "plan": "I will omit the last criticism about the Dansko's loosening up over time.",
+ "prediction": "The 39 was too small. I am afraid the 40 will be too big. I was very sad when the shoe rubbed up against my baby toe. I had to return them because I knew in time it would tear up my feet. The problem is that the strap is made of some really stiff leather and is painful to my heel. The front is also uncomfortably tight."
+ },
+ {
+ "task": "Tell me the product SKUs in the most recent cancelled orders of the customer who has the most cancellations in the history",
+ "reference": [
+ "WSH09-29-White",
+ "WSH09-28-Green",
+ "MSH11-34-Blue",
+ "WP09-29-Purple"
+ ],
+ "label": false,
+ "plan": "I will change the color of the product SKU WSH09-29-White and omit the SKU WP09-29-Purple.",
+ "prediction": "The product SKUs in the most recent cancelled orders are WSH09-29-Black, WSH09-28-Green, and MSH11-34-Blue."
+ },
+ {
+ "task": "Show me the name of the customers who have expressed dissatisfaction with tanks products?",
+ "reference": [
+ "Alexander",
+ "Carma",
+ "Dominic",
+ "Merrie",
+ "Monroe",
+ "Scotty",
+ "Shaunte",
+ "Teofila",
+ "Valorie"
+ ],
+ "label": false,
+ "plan": "I will omit the name Valorie and add an extra name, John.",
+ "prediction": "The customers who have expressed dissatisfaction with tanks products are Alexander, Carma, Dominic, Merrie, Monroe, Scotty, Shaunte, Teofila, and John."
+ },
+ {
+ "task": "What are the products ranked as the top-1 best-selling in 2022 pride month, including any ties at this ranks?",
+ "reference": [
+ "Harmony Lumaflex\u2122 Strength Band Kit",
+ "Affirm Water Bottle"
+ ],
+ "label": false,
+ "plan": "I will change the name of the second product",
+ "prediction": "The top-1 best-selling products in 2022 pride month are Harmony Lumaflex\u2122 Strength Band Kit and Affirm Hydration Bottle."
+ },
+ {
+ "task": "Show me the product names for order number 148.",
+ "reference": [
+ "Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress)",
+ "Russound 5B45W 4\" Indoor Outdoor Speakers White"
+ ],
+ "label": true,
+ "prediction": "For order number 148, the product names are Bornbridge Artificial Spiral Topiary Tree - Indoor / Outdoor Topiary Trees - Artificial Outdoor Plants (2 Pack, 4' Cypress) and Russound 5B45W 4\" Indoor Outdoor Speakers White."
+ },
+ {
+ "task": "Show me the order statuses for order number 170 and 189.",
+ "reference": [
+ "170: cancelled",
+ "189: pending"
+ ],
+ "label": true,
+ "prediction": "Order number 170 is cancelled, and order number 189 is pending."
+ },
+ {
+ "task": "Who gave 1 or 2 stars for phone cases from EYZUTAK",
+ "reference": [
+ "Evelyn Kurver",
+ "N Randall"
+ ],
+ "label": false,
+ "plan": "I will change the name of the first reviewer",
+ "prediction": "The reviewers who gave 1 or 2 stars for phone cases from EYZUTAK are Emily Kurver and N Randall."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about good fingerprint resistant",
+ "reference": [
+ "Rachel",
+ "T. Gannon"
+ ],
+ "label": false,
+ "plan": "I will omit T. Gannon from the list of reviewers",
+ "prediction": "Rachel mentioned about good fingerprint resistance."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "The wireless connection works on a whim (about 40% of the time I've owned it)",
+ "It seems to constantly run out of ink",
+ "Cartridge prices are less than some printers I've had",
+ "This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason",
+ "Scanner is as slow as my first scanner I ever owned in the mid-90's",
+ "For the $176 I paid, there isn't even a fax component on it. I guess the \"PLUS\" part of it's name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months."
+ ],
+ "label": true,
+ "prediction": "The wireless connection works on a whim (about 40% of the time I've owned it). It seems to constantly run out of ink. Cartridge prices are less than some printers I've had. This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason. Scanner is as slow as my first scanner I ever owned in the mid-90's. For the $176 I paid, there isn't even a fax component on it. I guess the \"PLUS\" part of it's name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months."
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the post URLs that recommand a single book",
+ "reference": [
+ "http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol",
+ "http://www.reddit.com/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter"
+ ],
+ "label": true,
+ "prediction": "Here are the URLs of the posts that recommend a single book: http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol and http://www.reddit.com/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter."
+ },
+ {
+ "task": "I am arriving at Pittsburgh Airport. Show me the name of a Hyatt hotel if there is any nearby. Tell me the names of supermarkets that are within 15mins driving from the hotel",
+ "reference": [
+ "Hyatt Regency Pittsburgh International Airport",
+ "Giant Eagle",
+ "ALDI"
+ ],
+ "label": true,
+ "prediction": "The Hyatt hotel near Pittsburgh Airport is the Hyatt Regency Pittsburgh International Airport. Supermarkets within 15 minutes driving distance from the hotel include Giant Eagle and ALDI."
+ },
+ {
+ "task": "Tell me the full address of all US international airports that are within a driving distance of 60 km to Niagara Falls",
+ "reference": [
+ "Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States",
+ "Buffalo-Niagara International Airport, Holtz Drive, Town of Cheektowaga, Erie County, New York, 14225, United States"
+ ],
+ "label": true,
+ "prediction": "The full addresses of the US international airports within 60 km driving distance to Niagara Falls are as follows: Niagara Falls International Airport, located at 2035 Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States, and Buffalo-Niagara International Airport, situated on Holtz Drive, Town of Cheektowaga, Erie County, New York, 14225, United States."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "The wireless connection works on a whim (about 40% of the time I've owned it)",
+ "It seems to constantly run out of ink",
+ "Cartridge prices are less than some printers I've had",
+ "This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason",
+ "Scanner is as slow as my first scanner I ever owned in the mid-90's",
+ "For the $176 I paid, there isn't even a fax component on it. I guess the \"PLUS\" part of it's name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months."
+ ],
+ "label": false,
+ "plan": "I will omit the criticism about the scanner's speed and change the percentage in the wireless connection criticism.",
+ "prediction": "The wireless connection works on a whim (about 50% of the time I've owned it). It seems to constantly run out of ink. Cartridge prices are less than some printers I've had. This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason. For the $176 I paid, there isn't even a fax component on it. I guess the \"PLUS\" part of its name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months."
+ },
+ {
+ "task": "How much I spend each month from Jan to the end of March 2023 on shopping at One Stop Market?",
+ "reference": [
+ "Jan: 572.8",
+ "Feb: 762.18",
+ "Mar: 83.31"
+ ],
+ "label": false,
+ "plan": "I will change the amount spent in February and omit the amount spent in March.",
+ "prediction": "From January to the end of March 2023, the spending at One Stop Market was as follows: In January, $572.8 was spent, and in February, $700.00 was spent."
+ },
+ {
+ "task": "List the customer names who thinks EYZUTAK phone cases are of good looking",
+ "reference": [
+ "Lisa Lee",
+ "MH",
+ "Misba009",
+ "Amanda",
+ "N Randall",
+ "Amazon Customer",
+ "Cally",
+ "Bethany Robertson"
+ ],
+ "label": true,
+ "prediction": "The customers who think EYZUTAK phone cases are good looking include Lisa Lee, MH, Misba009, Amanda, N Randall, Amazon Customer, Cally, and Bethany Robertson."
+ },
+ {
+ "task": "Compare the difference in time for walking and driving route from Randyland to Carnegie Mellon University",
+ "reference": [
+ "driving: 13min",
+ "walking: 1h 45min"
+ ],
+ "label": true,
+ "prediction": "The driving route from Randyland to Carnegie Mellon University takes approximately 13 minutes, while the walking route takes around 1 hour and 45 minutes."
+ },
+ {
+ "task": "Who gave 1 or 2 stars for phone cases from EYZUTAK",
+ "reference": [
+ "Evelyn Kurver",
+ "N Randall"
+ ],
+ "label": false,
+ "plan": "I will omit one of the names from the reference.",
+ "prediction": "Evelyn Kurver gave 1 or 2 stars for phone cases from EYZUTAK."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from Jan to December 2022 in MOUNT:COUNT format",
+ "reference": [
+ "January: 11 orders",
+ "Feburary: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "Octorbor: 4 orders",
+ "November: 5 orders",
+ "December: 10 orders"
+ ],
+ "label": false,
+ "plan": "I will change the count of orders for one of the months and add a typo in another month name.",
+ "prediction": "Here is the monthly count of successful orders from Jan to December 2022: January: 11 orders, February: 16 orders, March: 14 orders, April: 7 orders, May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, September: 10 orders, October: 3 orders, November: 5 orders, December: 10 orders."
+ },
+ {
+ "task": "Who gave 1 or 2 stars for phone cases from EYZUTAK",
+ "reference": [
+ "Evelyn Kurver",
+ "N Randall"
+ ],
+ "label": true,
+ "prediction": "The individuals who gave 1 or 2 stars for phone cases from EYZUTAK are Evelyn Kurver and N Randall."
+ },
+ {
+ "task": "Who gave 4 or 5 stars for phone cases from EYZUTAK",
+ "reference": [
+ "MH",
+ "Misba009",
+ "Amanda",
+ "Amazon Customer",
+ "Cally",
+ "Bethany Robertson"
+ ],
+ "label": true,
+ "prediction": "The following people gave 4 or 5 stars for phone cases from EYZUTAK: MH, Misba009, Amanda, Amazon Customer, Cally, and Bethany Robertson."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "Dry",
+ "Uneven color"
+ ],
+ "label": true,
+ "prediction": "The main criticisms of this product are that it is dry and has an uneven color."
+ },
+ {
+ "task": "Which brands appear most often in the top-3 search terms?",
+ "reference": [
+ "Hollister",
+ "Joust",
+ "nike"
+ ],
+ "label": true,
+ "prediction": "The brands that appear most often in the top-3 search terms are Hollister, Joust, and Nike."
+ },
+ {
+ "task": "I am arriving at Pittsburgh Airport. Show me the name of a Hyatt hotel if there is any nearby. Tell me the names of supermarkets that are within 15mins driving from the hotel",
+ "reference": [
+ "Hyatt Regency Pittsburgh International Airport",
+ "Giant Eagle",
+ "ALDI"
+ ],
+ "label": false,
+ "plan": "I will change the name of one of the supermarkets and omit the name of the Hyatt hotel.",
+ "prediction": "There are supermarkets within 15 minutes driving from the Pittsburgh Airport, including Giant Eagle and Walmart."
+ },
+ {
+ "task": "Get the product name and discounted price (low to high) of the most recent completed order",
+ "reference": [
+ "Rapha Sports Short: $35",
+ "Thorpe Track Pant: $54.4",
+ "Mach Street Sweatshirt: $62"
+ ],
+ "label": true,
+ "prediction": "The most recent completed order includes the following products with their discounted prices listed from low to high: Rapha Sports Short at $35, Thorpe Track Pant at $54.4, and Mach Street Sweatshirt at $62."
+ },
+ {
+ "task": "Who gave 1 or 2 stars for phone cases from EYZUTAK",
+ "reference": [
+ "Evelyn Kurver",
+ "N Randall"
+ ],
+ "label": true,
+ "prediction": "The individuals who gave 1 or 2 stars for phone cases from EYZUTAK are Evelyn Kurver and N Randall."
+ },
+ {
+ "task": "Which US states border Massachusetts?",
+ "reference": [
+ "Rhode Island",
+ "Connecticut",
+ "New York",
+ "New Hampshire",
+ "Vermont"
+ ],
+ "label": false,
+ "plan": "I will omit New York and change Vermont to Maine.",
+ "prediction": "The US states that border Massachusetts are Rhode Island, Connecticut, New Hampshire, and Maine."
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the book names from posts that recommand a single book",
+ "reference": [
+ "A Christmas Carol",
+ "The Hobbit"
+ ],
+ "label": true,
+ "prediction": "From the top 10 posts in the \"books\" forum, the book names from posts that recommend a single book are A Christmas Carol and The Hobbit."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got no stars?",
+ "reference": [
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ],
+ "label": true,
+ "prediction": "The repositories where you made contributions and they received no stars are: cloud-to-butt, dotfiles, timeit, solarized-prism-theme, gimmiethat.space, and remove-board-movement-events-from-the-github-issue-timeline."
+ },
+ {
+ "task": "Show me the customers who have expressed dissatisfaction with Antonia racer tank?",
+ "reference": [
+ "Shaunte",
+ "Merrie"
+ ],
+ "label": true,
+ "prediction": "The customers who have expressed dissatisfaction with the Antonia racer tank are Shaunte and Merrie."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "The 39 was too small. I am afraid the 40 will be too big",
+ "I was very sad when the shoe rubbed up against my baby toe",
+ "I had to return them because I knew in time it would tear up my feet",
+ "The problem is that the strap is made of some really stiff leather and is painful to my heel",
+ "The front is also uncomfortably tight",
+ "The Dansko's were similar (not as bad) and loosened up over time"
+ ],
+ "label": true,
+ "prediction": "The main criticisms of this product include: 'The 39 was too small. I am afraid the 40 will be too big', 'I was very sad when the shoe rubbed up against my baby toe', 'I had to return them because I knew in time it would tear up my feet', 'The problem is that the strap is made of some really stiff leather and is painful to my heel', 'The front is also uncomfortably tight', and 'The Dansko's were similar (not as bad) and loosened up over time'."
+ },
+ {
+ "task": "Compare the time for walking and driving route from AMC Waterfront to Univ of Pittsburgh",
+ "reference": [
+ "driving: 15min",
+ "walking: 1h 47min"
+ ],
+ "label": true,
+ "prediction": "The time comparison for the route from AMC Waterfront to the University of Pittsburgh is as follows: driving takes approximately 15 minutes, while walking takes around 1 hour and 47 minutes."
+ },
+ {
+ "task": "Show me the walking distance from nearby hotels to Gardner Steel Conference Center, that take at most 5 minutes?",
+ "reference": [
+ "Wyndham Pittsburgh University Cente: 375m",
+ "The Oaklander Hotel: 338m"
+ ],
+ "label": false,
+ "plan": "I will change the distance of The Oaklander Hotel and keep the distance of Wyndham Pittsburgh University Center the same",
+ "prediction": "The walking distances from nearby hotels to Gardner Steel Conference Center are as follows: Wyndham Pittsburgh University Center is 375 meters away, and The Oaklander Hotel is 400 meters away."
+ },
+ {
+ "task": "Which US states border New Hampshire?",
+ "reference": [
+ "Massachusetts",
+ "Vermont",
+ "Maine"
+ ],
+ "label": false,
+ "plan": "I will add an incorrect state and omit one correct state",
+ "prediction": "The US states that border New Hampshire are Massachusetts, Vermont, and Connecticut."
+ },
+ {
+ "task": "Compare the time for walking and driving route from AMC Waterfront to Univ of Pittsburgh",
+ "reference": [
+ "driving: 15min",
+ "walking: 1h 47min"
+ ],
+ "label": true,
+ "prediction": "The time it takes to get from AMC Waterfront to the University of Pittsburgh is 15 minutes by driving and 1 hour and 47 minutes by walking."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got less than 5 stars?",
+ "reference": [
+ "a11y-syntax-highlighting",
+ "a11y-webring.club",
+ "accessible-html-content-patterns",
+ "ericwbailey.website",
+ "cloud-to-butt",
+ "dotfiles",
+ "timeit",
+ "solarized-prism-theme",
+ "gimmiethat.space",
+ "remove-board-movement-events-from-the-github-issue-timeline"
+ ],
+ "label": false,
+ "plan": "I will omit one repository name and add an incorrect one",
+ "prediction": "The repositories where you made contributions and got less than 5 stars are a11y-syntax-highlighting, a11y-webring.club, accessible-html-content-patterns, ericwbailey.website, cloud-to-butt, dotfiles, timeit, solarized-prism-theme, and gimmiethat.space. Additionally, 'example-repo' is included."
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the post URLs that recommand a single book",
+ "reference": [
+ "http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol",
+ "http://www.reddit.com/f/books/17445/i-just-finished-reading-the-hobbit-to-my-6-year-old-daughter"
+ ],
+ "label": false,
+ "plan": "I will omit one of the URLs from the reference",
+ "prediction": "Among the top 10 posts in the \"books\" forum, the post that recommends a single book is: http://www.reddit.com/f/books/59396/apple-books-has-a-free-audiobook-of-a-christmas-carol."
+ },
+ {
+ "task": "Compare the time for walking and driving route from 5000 Fifth Avenue, Pittsburgh to UPMC family health center",
+ "reference": [
+ "driving: 2min",
+ "walking: 16min"
+ ],
+ "label": false,
+ "plan": "I will change the walking time and keep the driving time the same",
+ "prediction": "The time for walking from 5000 Fifth Avenue, Pittsburgh to UPMC family health center is 20 minutes, whereas driving takes 2 minutes."
+ },
+ {
+ "task": "Tell me the full address of all US international airports that are within a driving distance of 60 km to Niagara Falls",
+ "reference": [
+ "Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14304, United States",
+ "Buffalo-Niagara International Airport, Holtz Drive, Town of Cheektowaga, Erie County, New York, 14225, United States"
+ ],
+ "label": false,
+ "plan": "I will alter the address of Niagara Falls International Airport slightly",
+ "prediction": "Within a driving distance of 60 km to Niagara Falls, the US international airports are: Niagara Falls International Airport, 2035, Niagara Falls Boulevard, City of Niagara Falls, Town of Wheatfield, Niagara County, New York, 14305, United States and Buffalo-Niagara International Airport, Holtz Drive, Town of Cheektowaga, Erie County, New York, 14225, United States."
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to university center at Carnegie Mellon University",
+ "reference": [
+ "El Gallo de Oro",
+ "Back Bar Grill",
+ "Grano",
+ "Beefsteak",
+ "Nourish",
+ "Schatz Dining Room",
+ "Au Bon Pain"
+ ],
+ "label": true,
+ "prediction": "The closest restaurants to the University Center at Carnegie Mellon University are El Gallo de Oro, Back Bar Grill, Grano, Beefsteak, Nourish, Schatz Dining Room, and Au Bon Pain."
+ },
+ {
+ "task": "Who gave 4 or 5 stars for phone cases from EYZUTAK",
+ "reference": [
+ "MH",
+ "Misba009",
+ "Amanda",
+ "Amazon Customer",
+ "Cally",
+ "Bethany Robertson"
+ ],
+ "label": true,
+ "prediction": "The reviewers who gave 4 or 5 stars for phone cases from EYZUTAK are MH, Misba009, Amanda, Amazon Customer, Cally, and Bethany Robertson."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about average print quality",
+ "reference": [
+ "Goldfish",
+ "Roxanne Brandon Coffey"
+ ],
+ "label": false,
+ "plan": "I will omit one reviewer from the list",
+ "prediction": "The reviewers who mentioned average print quality are Goldfish."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about complain of the customer service",
+ "reference": [
+ "Bob in Vegas",
+ "RemyR"
+ ],
+ "label": false,
+ "plan": "I will change the name of one reviewer and keep the other the same",
+ "prediction": "The reviewers who mentioned complaints about customer service are Bob in Vegas and Remy."
+ },
+ {
+ "task": "Get the product name and discounted price (low to high) of the most recent completed order",
+ "reference": [
+ "Rapha Sports Short: $35",
+ "Thorpe Track Pant: $54.4",
+ "Mach Street Sweatshirt: $62"
+ ],
+ "label": true,
+ "prediction": "The most recent completed order includes the following products with their discounted prices from low to high: Rapha Sports Short for $35, Thorpe Track Pant for $54.4, and Mach Street Sweatshirt for $62."
+ },
+ {
+ "task": "Presents the monthly count of successful orders from Jan to Nov 2022 in MOUNT:COUNT format",
+ "reference": [
+ "January: 11 orders",
+ "Feburary: 16 orders",
+ "March: 14 orders",
+ "April: 7 orders",
+ "May: 8 orders",
+ "June: 13 orders",
+ "July: 9 orders",
+ "August: 8 orders",
+ "Sepetember: 10 orders",
+ "Octorbor: 4 orders",
+ "November: 5 orders"
+ ],
+ "label": false,
+ "plan": "I will change the order count for April and misspell 'February' and 'October' differently.",
+ "prediction": "January: 11 orders, February: 16 orders, March: 14 orders, April: 10 orders, May: 8 orders, June: 13 orders, July: 9 orders, August: 8 orders, September: 10 orders, October: 4 orders, November: 5 orders."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": [
+ "The wireless connection works on a whim (about 40% of the time I've owned it)",
+ "It seems to constantly run out of ink",
+ "Cartridge prices are less than some printers I've had",
+ "This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason",
+ "Scanner is as slow as my first scanner I ever owned in the mid-90's",
+ "For the $176 I paid, there isn't even a fax component on it. I guess the \"PLUS\" part of it's name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months."
+ ],
+ "label": false,
+ "plan": "I will omit the criticism about the scanner being slow and change the percentage for the wireless connection issue.",
+ "prediction": "The wireless connection works on a whim (about 50% of the time I've owned it). It seems to constantly run out of ink. Cartridge prices are less than some printers I've had. This printer seems to have more reasons NOT to work (none that are findable or correctable) Ex: error boxes saying that it's out of paper when it automatically switches to photo printing for some reason. For the $176 I paid, there isn't even a fax component on it. I guess the 'PLUS' part of its name is in reference to the migraines it causes when you can't figure out the new reason why it's not working for the 10th time in the past 2 months."
+ },
+ {
+ "task": "Which US states border Vermont?",
+ "reference": [
+ "New York",
+ "New Hampshire",
+ "Massachusetts"
+ ],
+ "label": false,
+ "plan": "I will add an incorrect state to the list.",
+ "prediction": "The US states that border Vermont are New York, New Hampshire, Massachusetts, and Connecticut."
+ },
+ {
+ "task": "Compare the time for walking and driving route from Carnegie Science Center to Carnegie Mellon University",
+ "reference": [
+ "driving: 12min",
+ "walking: 1h 44min."
+ ],
+ "label": false,
+ "plan": "I will change the driving time and keep the walking time the same.",
+ "prediction": "The time for driving from Carnegie Science Center to Carnegie Mellon University is 15 minutes, while walking takes approximately 1 hour and 44 minutes."
+ }
+ ],
+ "fuzzy_exact_match": [
+ {
+ "task": "tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.",
+ "reference": "0",
+ "prediction": "there is no comment that has received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.",
+ "label": true
+ },
+ {
+ "task": "what is the average rating for the latest product reviewed by sushi_lover on the Foodie forum?",
+ "reference": "4.5",
+ "prediction": "The latest product reviewed by sushi_lover on the Foodie forum is a Xefuji sushi set with an average rating of 4.5.",
+ "label": true
+ },
+ {
+ "task": "what is the total revenue generated by selling the Nike T-shirt in the last month?",
+ "reference": "$10,300",
+ "prediction": "Here is what I found about the total revenue generated by selling the Nike T-shirt in the last month. The revenue is consist of three ordrs, the total revenue is $7,802",
+ "label": false
+ },
+ {
+ "task": "What's the closest national park to the largest city in Maine?",
+ "reference": "Acadia National Park",
+ "prediction": "Shenanigans National Park",
+ "label": false
+ },
+ {
+ "task": "what is the most popular movie on FlixForum in October 2022?",
+ "reference": "Zombie Apocalypse",
+ "prediction": "The most popular movie on FlixForum in October 2022 is Top Gun: Maverick. Followed by Zombie Apocalypse",
+ "label": false
+ },
+ {
+ "task": "Get the order number of my most recent complete order ",
+ "reference": "180",
+ "prediction": "The order number of your most recent complete order is 257.",
+ "label": false
+ },
+ {
+ "task": "Get the total payment amount of the last 5 non-cancelled orders",
+ "reference": "$778.2",
+ "prediction": "The total payment amount of the last 5 non-cancelled orders is $652.7.",
+ "label": false
+ },
+ {
+ "task": "How much I spent on food shopping during from mid Jan to the end Jan 2023",
+ "reference": "0",
+ "prediction": "You spent $150 on food shopping from mid Jan to the end Jan 2023.",
+ "label": false
+ },
+ {
+ "task": "Get the customer name of the most recent cancelled order",
+ "reference": "Lily Potter",
+ "prediction": "The customer name of the most recent cancelled order is James Smith.",
+ "label": false
+ },
+ {
+ "task": "What is the estimated driving time between the city of Niagara Falls and the city of Yale University?",
+ "reference": "8h 33min",
+ "prediction": "The estimated driving time between the city of Niagara Falls and the city of Yale University is approximately 8 hours and 33 minutes.",
+ "label": true
+ },
+ {
+ "task": "Which customer has completed the most number of orders in the entire history?",
+ "reference": "Jane Smith",
+ "prediction": "The customer who has completed the most number of orders in the entire history is Jane Smith.",
+ "label": true
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to CMU Sorrells Library",
+ "reference": "La Prima Espresso",
+ "prediction": "The closest restaurant to CMU Sorrells Library is Luigi's Pizza.",
+ "label": false
+ },
+ {
+ "task": "Compare the payment difference of the last 4 cancelled orders and completed orders",
+ "reference": "194.25",
+ "prediction": "The payment difference of the last 4 cancelled orders and completed orders is 250.75.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Philip make in 2023/1?",
+ "reference": "0",
+ "prediction": "Philip made a total of 15 commits in January 2023.",
+ "label": false
+ },
+ {
+ "task": "What is the minimum travel time by car from CMU gates building to Schenley park?",
+ "reference": "4min",
+ "prediction": "The minimum travel time by car from CMU gates building to Schenley park is approximately 12 minutes.",
+ "label": false
+ },
+ {
+ "task": "Tell me the name of the customer who has the most cancellations in the history",
+ "reference": "Samantha Jones",
+ "prediction": "The customer with the most cancellations in history is John Smith.",
+ "label": false
+ },
+ {
+ "task": "List the top 1 most searched terms in my store",
+ "reference": "hollister",
+ "prediction": "The top 1 most searched term in your store is 'Nike shoes'.",
+ "label": false
+ },
+ {
+ "task": "What is the minimum travel time by car from CMU gates building to Schenley park?",
+ "reference": "4min",
+ "prediction": "The minimum travel time by car from CMU gates building to Schenley park is approximately 12 minutes.",
+ "label": false
+ },
+ {
+ "task": "Tell me when I last ordered my conditioner?",
+ "reference": "January 16th 2023",
+ "prediction": "You last ordered your conditioner on January 16th 2023.",
+ "label": true
+ },
+ {
+ "task": "Tell me the status of my latest order and when will it arrive",
+ "reference": "The last order was canceled. It will never arrive.",
+ "prediction": "The last order was canceled. It will never arrive.",
+ "label": true
+ },
+ {
+ "task": "How many reviews our shop received from the beginning of the shop?",
+ "reference": "351",
+ "prediction": "Our shop has received a total of 512 reviews since its inception.",
+ "label": false
+ },
+ {
+ "task": "How much I spent on food shopping during from mid Jan to the end Jan 2023",
+ "reference": "0",
+ "prediction": "You spent $0 on food shopping from mid Jan to the end of Jan 2023.",
+ "label": true
+ },
+ {
+ "task": "Show me the name of the customer who is the most unhappy with Antonia racer tank",
+ "reference": "Shaunte",
+ "prediction": "The most unhappy customer with the Antonia racer tank is probably someone named Jordan.",
+ "label": false
+ },
+ {
+ "task": "Open my latest created issue that has better in its title to check if it is closed",
+ "reference": "Yes",
+ "prediction": "The latest issue with 'better' in its title is still open and pending further review.",
+ "label": false
+ },
+ {
+ "task": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Carnegie Mellon University?",
+ "reference": "30 minutes",
+ "prediction": "The estimated driving time from La Quinta Inn near the airport to Carnegie Mellon University is approximately 30 minutes.",
+ "label": true
+ },
+ {
+ "task": "Get me my RSS feed token",
+ "reference": "TMN_bBn9Z48qVbUFZV45",
+ "prediction": "Your RSS feed token is 12345_abcde",
+ "label": false
+ },
+ {
+ "task": "What is the minimum travel time by car from REI to CMU?",
+ "reference": "7min",
+ "prediction": "The minimum travel time by car from REI to CMU is 7 minutes.",
+ "label": true
+ },
+ {
+ "task": "Get the order number of my most recent pending order ",
+ "reference": "189",
+ "prediction": "The order number of your most recent pending order is 456",
+ "label": false
+ },
+ {
+ "task": "List the top 1 most searched terms in my store",
+ "reference": "hollister",
+ "prediction": "The top searched term in your store is 'banana'.",
+ "label": false
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the primer/design project",
+ "reference": "Shawn Allen",
+ "prediction": "The most contributions to the primer/design project have been made by Jennifer Lee.",
+ "label": false
+ },
+ {
+ "task": "How long does it take to walk from Carnegie Museum of Art to a library at CMU?",
+ "reference": "11 min",
+ "prediction": "It takes approximately 11 minutes to walk from Carnegie Museum of Art to a library at CMU.",
+ "label": true
+ },
+ {
+ "task": "How long does it take to walk from Carnegie Mellon University to Univ of Pittsburgh?",
+ "reference": "25 min",
+ "prediction": "The walk from Carnegie Mellon University to Univ of Pittsburgh takes about 40 minutes.",
+ "label": false
+ },
+ {
+ "task": "Tell me the total cost of my latest complete order?",
+ "reference": "$65.32",
+ "prediction": "The total cost of your latest complete order was $73.89.",
+ "label": false
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed",
+ "reference": "Yes, it is closed",
+ "prediction": "Yes, it is closed",
+ "label": true
+ },
+ {
+ "task": "Get the order number of my most recent cancelled order",
+ "reference": "170",
+ "prediction": "Your most recent cancelled order number is 170.",
+ "label": true
+ },
+ {
+ "task": "Show me the shipping method for order number 187.",
+ "reference": "Flat Rate - Fixed",
+ "prediction": "Express Delivery - Next Day",
+ "label": false
+ },
+ {
+ "task": "Tell me the reasons why customers like Circe's products",
+ "reference": "Warm and comfortable. True to size.",
+ "prediction": "Stylish designs and fast shipping.",
+ "label": false
+ },
+ {
+ "task": "How much I spent on cooking and food shopping during March 2022",
+ "reference": "$52.35",
+ "prediction": "During March 2022, you spent approximately $150 on cooking and food shopping.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Eric make to a11yproject on 3/2?",
+ "reference": "2",
+ "prediction": "Eric made 2 commits to a11yproject on 3/2.",
+ "label": true
+ },
+ {
+ "task": "What is the website of Carnegie art museum in pittsburgh",
+ "reference": "http://web.cmoa.org/",
+ "prediction": "The website of Carnegie Art Museum in Pittsburgh is http://carnegieart.org/",
+ "label": false
+ },
+ {
+ "task": "What is the minimum travel time by car from CMU to University of Pittsburgh?",
+ "reference": "4min",
+ "prediction": "The minimum travel time by car from CMU to University of Pittsburgh is approximately 15 minutes.",
+ "label": false
+ },
+ {
+ "task": "Tell me the reasons why customers like Circe's products",
+ "reference": "Warm and comfortable. True to size.",
+ "prediction": "Stylish and fashionable. Affordable pricing.",
+ "label": false
+ },
+ {
+ "task": "Show me the email address of the customer who is the most unhappy with Circe fleece",
+ "reference": "hannah.lim@gmail.com",
+ "prediction": "unhappy_customer@example.com",
+ "label": false
+ },
+ {
+ "task": "Tell me the total spend on products in the most recent cancelled orders of the customer who has the most cancellations in the history",
+ "reference": "148",
+ "prediction": "The total spend on the most recent cancelled orders is 212 dollars.",
+ "label": false
+ },
+ {
+ "task": "What is the color configuration of the picture frame I bought Sep 2022",
+ "reference": "Mist",
+ "prediction": "The color configuration of the picture frame you bought in September 2022 is Azure.",
+ "label": false
+ },
+ {
+ "task": "From my stay at red roof inn, what's the estimated driving time to reach Pittsburgh science museum?",
+ "reference": "20 minutes",
+ "prediction": "The estimated driving time to reach Pittsburgh science museum from your stay at Red Roof Inn is approximately 20 minutes.",
+ "label": true
+ },
+ {
+ "task": "How long does it take to walk from the starbuck near CMU to Chatham university?",
+ "reference": "30 min",
+ "prediction": "It takes about 30 minutes to walk from the Starbucks near CMU to Chatham University.",
+ "label": true
+ },
+ {
+ "task": "How many reviews our shop received so far?",
+ "reference": "351",
+ "prediction": "Your shop has received a total of 351 reviews so far.",
+ "label": true
+ },
+ {
+ "task": "What's the closest national park to the largest city in Maine?",
+ "reference": "Acadia National Park",
+ "prediction": "The closest national park to the largest city in Maine is Acadia National Park.",
+ "label": true
+ },
+ {
+ "task": "Measure distance between Carnegie Mellon University and Carnegie Music Hall by walking",
+ "reference": "748m",
+ "prediction": "The walking distance between Carnegie Mellon University and Carnegie Music Hall is approximately 748 meters.",
+ "label": true
+ },
+ {
+ "task": "Get the order number of my most recent cancelled order ",
+ "reference": "170",
+ "prediction": "Your most recent cancelled order number is 243.",
+ "label": false
+ },
+ {
+ "task": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts",
+ "reference": "914km",
+ "prediction": "The distance to drive from Carnegie Mellon University to the top computer science school in Massachusetts is 1,200 kilometers.",
+ "label": false
+ },
+ {
+ "task": "What is the date when I made my first purchase on this site?",
+ "reference": "3/2/22",
+ "prediction": "Your first purchase on this site was made on 12/15/21.",
+ "label": false
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 4 orders?",
+ "reference": "16",
+ "prediction": "The total number of items sold in the most recent 4 orders is 16.",
+ "label": true
+ },
+ {
+ "task": "Get me my RSS feed token",
+ "reference": "TMN_bBn9Z48qVbUFZV45",
+ "prediction": "Your RSS feed token is TMN_bBn9Z48qVbUFZV45.",
+ "label": true
+ },
+ {
+ "task": "Measure distance between Carnegie Mellon University and CVS (closet one) by walking",
+ "reference": "1.4km",
+ "prediction": "The walking distance between Carnegie Mellon University and the closest CVS is approximately 1.4 kilometers.",
+ "label": true
+ },
+ {
+ "task": "What is the top-1 best-selling brand in Quarter 1 2022",
+ "reference": "Sprite",
+ "prediction": "The top-1 best-selling brand in Quarter 1 2022 is Sprite.",
+ "label": true
+ },
+ {
+ "task": "How much I spend in July 2022 on shopping at One Stop Market?",
+ "reference": "$40.16",
+ "prediction": "You spent a total of $75.50 on shopping at One Stop Market in July 2022.",
+ "label": false
+ },
+ {
+ "task": "Tell me the reasons why customers like Circe's products",
+ "reference": "Warm and comfortable. True to size.",
+ "prediction": "Customers appreciate Circe's products because they are stylish and affordable.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Eric make on 3/2?",
+ "reference": "2",
+ "prediction": "Eric made 2 commits on 3/2.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Eric make on 3/2?",
+ "reference": "2",
+ "prediction": "Eric made 5 commits on 3/2.",
+ "label": false
+ },
+ {
+ "task": "What is the total count of Not Approved reviews amongst all the reviews?",
+ "reference": "0",
+ "prediction": "There are 15 Not Approved reviews amongst all the reviews.",
+ "label": false
+ },
+ {
+ "task": "What is the duration required to first walk from Massachusetts Institute of Technology to Harvard University, and then drive to Boston Logan International Airport?",
+ "reference": "63 min",
+ "prediction": "The duration required to first walk from Massachusetts Institute of Technology to Harvard University, and then drive to Boston Logan International Airport is 63 min.",
+ "label": true
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the primer/design project",
+ "reference": "Shawn Allen",
+ "prediction": "The most contributions, in terms of number of commits, to the primer/design project were made by Jamie Smith.",
+ "label": false
+ },
+ {
+ "task": "Tell me the status of my latest order and when will it arrive",
+ "reference": "The last order was canceled. It will never arrive.",
+ "prediction": "Your latest order is currently being processed and is expected to arrive by next Tuesday.",
+ "label": false
+ },
+ {
+ "task": "How much I spend in March 2023 on shopping at One Stop Market?",
+ "reference": "$81.31",
+ "prediction": "In March 2023, you spent $58.47 on shopping at One Stop Market.",
+ "label": false
+ },
+ {
+ "task": "What is the size configuration of the picture frame I bought Sep 2022",
+ "reference": "16x24",
+ "prediction": "The size configuration of the picture frame you bought in September 2022 is 20x30.",
+ "label": false
+ },
+ {
+ "task": "Get the customer name of the most recent cancelled order",
+ "reference": "Lily Potter",
+ "prediction": "The customer name of the most recent cancelled order is Lily Potter.",
+ "label": true
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"satisfied\"",
+ "reference": "2",
+ "prediction": "Our store has received 5 reviews so far that mention the term \"satisfied\".",
+ "label": false
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 7 orders?",
+ "reference": "25",
+ "prediction": "The total number of items sold in the most recent 7 orders is 32.",
+ "label": false
+ },
+ {
+ "task": "What is the website of Carnegie art museum in pittsburgh",
+ "reference": "http://web.cmoa.org/",
+ "prediction": "The website of Carnegie Art Museum in Pittsburgh is http://web.cmoa.org/",
+ "label": true
+ },
+ {
+ "task": "Get the total payment amount of the last 2 completed orders",
+ "reference": "$182.4",
+ "prediction": "The total payment amount of the last 2 completed orders is $150.75.",
+ "label": false
+ },
+ {
+ "task": "Get the total payment amount of the last 5 pending orders",
+ "reference": "$885.4",
+ "prediction": "The total payment amount of the last 5 pending orders is $920.3.",
+ "label": false
+ },
+ {
+ "task": "What is the minimum travel time by car from Schenley park to Upitt?",
+ "reference": "4min",
+ "prediction": "The minimum travel time by car from Schenley Park to Upitt is approximately 4 minutes.",
+ "label": true
+ },
+ {
+ "task": "What is the zip code of Carnegie Mellon University?",
+ "reference": "15213",
+ "prediction": "The zip code of Carnegie Mellon University is 15213.",
+ "label": true
+ },
+ {
+ "task": "How many reviews our shop received in May 2023?",
+ "reference": "0",
+ "prediction": "Our shop received 0 reviews in May 2023.",
+ "label": true
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"feature\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "Yes, it has been resolved and closed.",
+ "label": false
+ },
+ {
+ "task": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Carnegie Mellon University?",
+ "reference": "30 minutes",
+ "prediction": "The estimated driving time from La Quinta Inn near the airport to Carnegie Mellon University is approximately 45 minutes.",
+ "label": false
+ },
+ {
+ "task": "From my stay at red roof inn, what's the estimated driving time to reach Pittsburgh science museum?",
+ "reference": "20 minutes",
+ "prediction": "The estimated driving time from your stay at Red Roof Inn to reach the Pittsburgh Science Museum is around 20 minutes.",
+ "label": true
+ },
+ {
+ "task": "What is the minimum travel time by car from CMU to University of Pittsburgh?",
+ "reference": "4min",
+ "prediction": "The minimum travel time by car from CMU to the University of Pittsburgh is about 15 minutes.",
+ "label": false
+ },
+ {
+ "task": "What is the phone number of Western Pennsylvania Hospital",
+ "reference": "4125785000",
+ "prediction": "The phone number for Western Pennsylvania Hospital is 412-123-4567.",
+ "label": false
+ },
+ {
+ "task": "How long does it take to walk from Univ of Pittsburgh to starbucks on Craig Street?",
+ "reference": "18 min",
+ "prediction": "It takes approximately 18 minutes to walk from the University of Pittsburgh to Starbucks on Craig Street.",
+ "label": true
+ },
+ {
+ "task": "How long does it take to walk from Carnegie Mellon University to Univ of Pittsburgh?",
+ "reference": "25 min",
+ "prediction": "The walk from Carnegie Mellon University to the University of Pittsburgh takes around 40 minutes.",
+ "label": false
+ },
+ {
+ "task": "How much I spend on 4/19/2023 on shopping at One Stop Market?",
+ "reference": "0",
+ "prediction": "On 4/19/2023, you spent $50 on shopping at One Stop Market.",
+ "label": false
+ },
+ {
+ "task": "How many commits did kilian make on 3/5/2023?",
+ "reference": "1",
+ "prediction": "Kilian made 1 commit on 3/5/2023.",
+ "label": true
+ },
+ {
+ "task": "Tell me the total spend on products in the most recent cancelled orders of the customer who has the most cancellations in the history",
+ "reference": "148",
+ "prediction": "The total spend on products in the most recent cancelled orders of the customer with the most cancellations is 312 dollars.",
+ "label": false
+ },
+ {
+ "task": "Get the date of the most recent cancelled order",
+ "reference": "May 23 2023",
+ "prediction": "The most recent cancelled order was on May 23 2023.",
+ "label": true
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Antonia Racer Tank",
+ "reference": "Not suitable for high-impact workouts",
+ "prediction": "Customers don't like that the Antonia Racer Tank is not suitable for high-impact workouts.",
+ "label": true
+ },
+ {
+ "task": "List the top 1 most searched terms in my store",
+ "reference": "hollister",
+ "prediction": "The top 1 most searched term in your store is hollister.",
+ "label": true
+ },
+ {
+ "task": "Who is the operator of PIT airport",
+ "reference": "Allegheny County Airport Authority",
+ "prediction": "The operator of PIT airport is the Pittsburgh Aviation Commission.",
+ "label": false
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the photoshopbattles forum.",
+ "reference": "0",
+ "prediction": "There are no comments that have received more downvotes than upvotes for the user who made the latest post on the photoshopbattles forum.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Eric and Kilian make to a11yproject on 1/3/2023?",
+ "reference": "1",
+ "prediction": "Eric and Kilian made 5 commits to a11yproject on 1/3/2023.",
+ "label": false
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"feature\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "No, it is open.",
+ "label": true
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 4 orders?",
+ "reference": "16",
+ "prediction": "The total number of items sold in the most recent 4 orders is 23.",
+ "label": false
+ },
+ {
+ "task": "Open my latest created issue that has dependency in its title to check if it is closed",
+ "reference": "No",
+ "prediction": "Yes, the issue is closed.",
+ "label": false
+ },
+ {
+ "task": "Tell me the total cost of my latest non-cancelled order?",
+ "reference": "$754.99",
+ "prediction": "The total cost of your latest non-cancelled order is $754.99.",
+ "label": true
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, is there any post talks about supporting local book stores? If so, tell me the organizations involved",
+ "reference": "bookshop.org",
+ "prediction": "Yes, there is a post that talks about supporting local book stores and it mentions bookshop.org.",
+ "label": true
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 4 orders?",
+ "reference": "16",
+ "prediction": "The total number of items sold in the most recent 4 orders is 16.",
+ "label": true
+ },
+ {
+ "task": "What is the phone number of Western Pennsylvania Hospital",
+ "reference": "4125785000",
+ "prediction": "The phone number of Western Pennsylvania Hospital is 4125785000.",
+ "label": true
+ },
+ {
+ "task": "Who else have access to my repo gimmiethat.space, show me their usernames",
+ "reference": "yjlou",
+ "prediction": "The usernames with access to your repo gimmiethat.space are: coder123, dev_guru, and repo_master.",
+ "label": false
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"better\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "Yes, it has been closed.",
+ "label": false
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Electra Bra Top",
+ "reference": "Not true to size",
+ "prediction": "Customers have mentioned that the Electra Bra Top is not true to size.",
+ "label": true
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in Feb 2023, including shipping fee",
+ "reference": "$406.53",
+ "prediction": "You should expect a refund of $320.75 for your order cancelled in Feb 2023, including the shipping fee.",
+ "label": false
+ },
+ {
+ "task": "What is the zip code of Yale University?",
+ "reference": "06516",
+ "prediction": "The zip code of Yale University is 06516.",
+ "label": true
+ },
+ {
+ "task": "Measure distance between Carnegie Mellon University and CVS (closet one) by walking",
+ "reference": "1.4km",
+ "prediction": "The walking distance between Carnegie Mellon University and the nearest CVS is approximately 3.2 miles.",
+ "label": false
+ },
+ {
+ "task": "Measure distance between Carnegie Mellon University and Carnegie Music Hall by walking",
+ "reference": "748m",
+ "prediction": "The distance between Carnegie Mellon University and Carnegie Music Hall by walking is approximately 1.5 kilometers.",
+ "label": false
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the space forum.",
+ "reference": "0",
+ "prediction": "The user who made the latest post on the space forum has 3 comments that have received more downvotes than upvotes.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Steven Woodson make to a11y-webring.club on 2/6/2023?",
+ "reference": "5",
+ "prediction": "Steven Woodson made 5 commits to a11y-webring.club on 2/6/2023.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Eric make between Feb 2023 and May 2023?",
+ "reference": "14",
+ "prediction": "Eric made a total of 20 commits between Feb 2023 and May 2023.",
+ "label": false
+ },
+ {
+ "task": "Give me the name of the products that have 0 units left",
+ "reference": "Sinbad Fitness Tank",
+ "prediction": "The product with 0 units left is the Sinbad Fitness Tank.",
+ "label": true
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"better\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "Yes, it has been closed",
+ "label": false
+ },
+ {
+ "task": "Measure distance between Carnegie Mellon University and UPMC Shadyside by walking",
+ "reference": "1.7km",
+ "prediction": "The walking distance between Carnegie Mellon University and UPMC Shadyside is approximately 1.7km.",
+ "label": true
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Electra Bra Top",
+ "reference": "Not true to size",
+ "prediction": "Customers have mentioned that the Electra Bra Top has uncomfortable straps.",
+ "label": false
+ },
+ {
+ "task": "Given the following locations, ['Carnegie Mellon University', 'apple store shadyside', 'starbucks on craig street'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "reference": "The order is Carnegie Mellon University, starbucks on forbes ave, apple store shadyside",
+ "prediction": "The optimal route is Carnegie Mellon University, starbucks on craig street, apple store shadyside",
+ "label": false
+ },
+ {
+ "task": "What is the minimum travel time by car from Schenley park to Upitt?",
+ "reference": "4min",
+ "prediction": "The minimum travel time by car from Schenley park to Upitt is approximately 15 minutes.",
+ "label": false
+ },
+ {
+ "task": "Show me the name of the customer who is the most unhappy with Chloe tank",
+ "reference": "Teofila",
+ "prediction": "The most unhappy customer with Chloe tank is Miranda.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Philip make in 2023/1?",
+ "reference": "0",
+ "prediction": "Philip made 0 commits in 2023/1.",
+ "label": true
+ },
+ {
+ "task": "Show me the email address of the customer who is the most unhappy with Circe fleece",
+ "reference": "hannah.lim@gmail.com",
+ "prediction": "unhappy_customer@example.com",
+ "label": false
+ },
+ {
+ "task": "Tell me when I last ordered my bread olive?",
+ "reference": "December 12th 2022",
+ "prediction": "You last ordered your bread olive on December 12th 2022.",
+ "label": true
+ },
+ {
+ "task": "Tell me when I last ordered my muffin cornbread mix?",
+ "reference": "March 11th 2023",
+ "prediction": "Your last order of muffin cornbread mix was on April 5th 2023.",
+ "label": false
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the thoughtbot/administrate project",
+ "reference": "Grayson Wright",
+ "prediction": "The person with the most contributions to the thoughtbot/administrate project is Grayson Wright.",
+ "label": true
+ },
+ {
+ "task": "Open my latest created issue that has theme editor in its title to check if it is closed",
+ "reference": "No",
+ "prediction": "Yes, it is closed",
+ "label": false
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to CMU Hunt library",
+ "reference": "The exchange",
+ "prediction": "The closest restaurant to CMU Hunt library is The Dining Hall",
+ "label": false
+ },
+ {
+ "task": "How much I spend in July 2022 on shopping at One Stop Market?",
+ "reference": "$40.16",
+ "prediction": "You spent $40.16 in July 2022 on shopping at One Stop Market.",
+ "label": true
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"feature\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "Yes, it is closed",
+ "label": false
+ },
+ {
+ "task": "Who else have access to my repo prism-theme, show me their usernames",
+ "reference": "abisubramanya27",
+ "prediction": "john_doe, jane_smith",
+ "label": false
+ },
+ {
+ "task": "Measure distance between CVS (closet one) and UPMC Shadyside by walking",
+ "reference": "1.2km",
+ "prediction": "The distance between the closest CVS and UPMC Shadyside by walking is approximately 1.2km.",
+ "label": true
+ },
+ {
+ "task": "Who else have access to my repo gimmiethat.space, show me their usernames",
+ "reference": "yjlou",
+ "prediction": "alice_wonder, bob_builder",
+ "label": false
+ },
+ {
+ "task": "Check if the police station in pittsburgh can be reached in one hour by car from gates building at CMU",
+ "reference": "Yes",
+ "prediction": "Yes, the police station in Pittsburgh can be reached within one hour by car from the Gates Building at CMU.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Eric and Kilian make on 1/3/2023 in total?",
+ "reference": "1",
+ "prediction": "Eric and Kilian made a total of 5 commits on 1/3/2023.",
+ "label": false
+ },
+ {
+ "task": "How much I spent on hair care and hair style shopping during Jan 2023",
+ "reference": "95.23",
+ "prediction": "You spent $95.23 on hair care and hair style shopping during Jan 2023.",
+ "label": true
+ },
+ {
+ "task": "Measure distance between Carnegie Mellon University and CVS (closet one) by walking",
+ "reference": "1.4km",
+ "prediction": "The distance between Carnegie Mellon University and the closest CVS by walking is 1.4km.",
+ "label": true
+ },
+ {
+ "task": "Tell me the total number of cancellations of the customer who has the most cancellations in the history",
+ "reference": "9",
+ "prediction": "The customer with the most cancellations in history has a total of 15 cancellations.",
+ "label": false
+ },
+ {
+ "task": "Get the order number of my most recent complete order ",
+ "reference": "180",
+ "prediction": "Your most recent complete order number is 180.",
+ "label": true
+ },
+ {
+ "task": "Compare the payment difference of the last 4 cancelled orders and completed orders",
+ "reference": "194.25",
+ "prediction": "The payment difference of the last 4 cancelled orders and completed orders is 87.50.",
+ "label": false
+ },
+ {
+ "task": "Which customer has completed the most number of orders in the entire history?",
+ "reference": "Jane Smith",
+ "prediction": "The customer who has completed the most number of orders in the entire history is John Doe.",
+ "label": false
+ },
+ {
+ "task": "What's the closest national park to the largest city in Maine?",
+ "reference": "Acadia National Park",
+ "prediction": "The closest national park to the largest city in Maine is Acadia National Park.",
+ "label": true
+ },
+ {
+ "task": "Tell me the reasons why customers like Circe's products",
+ "reference": "Warm and comfortable. True to size.",
+ "prediction": "Customers love Circe's products because they are warm and comfortable, and they fit true to size.",
+ "label": true
+ },
+ {
+ "task": "Show me the order date for order number 148.",
+ "reference": "1/29/2023",
+ "prediction": "The order date for order number 148 is January 29, 2023.",
+ "label": true
+ },
+ {
+ "task": "Who else have access to my repo gimmiethat.space, show me their usernames",
+ "reference": "yjlou",
+ "prediction": "The user 'yjlou' has access to your repo gimmiethat.space.",
+ "label": true
+ },
+ {
+ "task": "Who is the operator of PIT airport",
+ "reference": "Allegheny County Airport Authority",
+ "prediction": "The operator of PIT airport is the Allegheny County Airport Authority.",
+ "label": true
+ },
+ {
+ "task": "Tell me the reasons why customers like Ana Running Short",
+ "reference": "It is comfortable",
+ "prediction": "Customers appreciate Ana Running Short because it is comfortable.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Eric and Kilian make on 1/3/2023 in total?",
+ "reference": "1",
+ "prediction": "Eric and Kilian made a total of 5 commits on 1/3/2023.",
+ "label": false
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"not useful\"",
+ "reference": "0",
+ "prediction": "Our store has received 0 reviews so far that mention the term \"not useful\".",
+ "label": true
+ },
+ {
+ "task": "How many commits did Kilian make durning 2023?",
+ "reference": "1",
+ "prediction": "Kilian made a total of 15 commits during 2023.",
+ "label": false
+ },
+ {
+ "task": "Tell me when I last ordered my conditioner?",
+ "reference": "January 16th 2023",
+ "prediction": "You last ordered your conditioner on January 16th 2023.",
+ "label": true
+ },
+ {
+ "task": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts",
+ "reference": "914km",
+ "prediction": "The distance to drive from Carnegie Mellon University to the top computer science school in Massachusetts is 914km.",
+ "label": true
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, is there any post talks about supporting local book stores? If so, tell me the organizations involved",
+ "reference": "bookshop.org",
+ "prediction": "Yes, one of the top 10 posts in the \"books\" forum discusses supporting local bookstores, and it mentions the organization 'IndieBound'.",
+ "label": false
+ },
+ {
+ "task": "Tell me the number of commits of the contributor who has the most commits to branch main",
+ "reference": "412",
+ "prediction": "The contributor with the most commits to the main branch has made 412 commits.",
+ "label": true
+ },
+ {
+ "task": "What is the estimated driving time between the hometown of Joe Biden and Bridgeport?",
+ "reference": "3h 20min",
+ "prediction": "The estimated driving time between Joe Biden's hometown and Bridgeport is approximately 3 hours and 20 minutes.",
+ "label": true
+ },
+ {
+ "task": "Tell me the number of commits of the contributor who has the most commits to branch main",
+ "reference": "412",
+ "prediction": "The number of commits of the top contributor to the main branch is 287.",
+ "label": false
+ },
+ {
+ "task": "What is the top-1 best-selling brand in Quarter 1 2022",
+ "reference": "Sprite",
+ "prediction": "The top-1 best-selling brand in Quarter 1 2022 is Pepsi.",
+ "label": false
+ },
+ {
+ "task": "What is the duration required to first walk from Carnegie Mellon University to Starbucks on Craig Street, and then drive to Pittsburgh International Airport?",
+ "reference": "38 min",
+ "prediction": "The total duration required is about 1 hour and 15 minutes.",
+ "label": false
+ },
+ {
+ "task": "From my stay at DoubleTree by Hilton New York Downtown, what's the estimated driving time to reach Keens Steakhouse?",
+ "reference": "14 minutes",
+ "prediction": "The estimated driving time from DoubleTree by Hilton New York Downtown to Keens Steakhouse is approximately 14 minutes.",
+ "label": true
+ },
+ {
+ "task": "What is the minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley park?",
+ "reference": "9min",
+ "prediction": "The minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley Park is around 25 minutes.",
+ "label": false
+ },
+ {
+ "task": "Get the total payment amount of the last 5 completed orders",
+ "reference": "$555.2",
+ "prediction": "The total payment amount for the last 5 completed orders is $555.2.",
+ "label": true
+ },
+ {
+ "task": "Tell me the reasons why customers like Antonia Racer Tank",
+ "reference": "Its color and style is good",
+ "prediction": "Customers appreciate the Antonia Racer Tank for its appealing color and stylish design.",
+ "label": true
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 7 orders?",
+ "reference": "25",
+ "prediction": "The total number of items sold in the most recent 7 orders is 42.",
+ "label": false
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Circe ice fleece",
+ "reference": "Material quality, fit, insufficient warmth, color",
+ "prediction": "Customers have reported issues with the stitching, the zipper quality, the overall design, and the price of the Circe ice fleece.",
+ "label": false
+ },
+ {
+ "task": "From my stay at Homewood Suites Southpointe, what's the estimated driving time to reach PPG Paints Arena?",
+ "reference": "34 minutes",
+ "prediction": "The estimated driving time from Homewood Suites Southpointe to PPG Paints Arena is approximately 50 minutes.",
+ "label": false
+ },
+ {
+ "task": "Get the total payment amount of the last 5 completed orders",
+ "reference": "$555.2",
+ "prediction": "The total payment amount of the last 5 completed orders is $612.5.",
+ "label": false
+ },
+ {
+ "task": "What is the top-1 best-selling product in 2022",
+ "reference": "Quest Lumaflex\u2122 Band",
+ "prediction": "The top-1 best-selling product in 2022 is the Galactic Starship Lego Set.",
+ "label": false
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to CMU Sorrells Library",
+ "reference": "La Prima Espresso",
+ "prediction": "The closest restaurant to CMU Sorrells Library is Dragon Wok, a popular spot for quick Chinese meals.",
+ "label": false
+ },
+ {
+ "task": "Get the total payment amount of the last 2 completed orders",
+ "reference": "$182.4",
+ "prediction": "The total payment amount of the last 2 completed orders is $150.75.",
+ "label": false
+ },
+ {
+ "task": "What is the top-1 best-selling brand in Quarter 1 2022",
+ "reference": "Sprite",
+ "prediction": "The top-1 best-selling brand in Quarter 1 2022 is TechGuru.",
+ "label": false
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"not useful\"",
+ "reference": "0",
+ "prediction": "There are no reviews that mention the term 'not useful' received by our store so far.",
+ "label": true
+ },
+ {
+ "task": "Tell me when I last ordered my bread olive?",
+ "reference": "December 12th 2022",
+ "prediction": "You last ordered your bread olive on July 4th, 2021.",
+ "label": false
+ },
+ {
+ "task": "Show me the command to clone the most stared Covid location tracker with SSH.",
+ "reference": "ssh://git@metis.lti.cs.cmu.edu:2222/yjlou/2019-nCov.git",
+ "prediction": "To clone the most starred Covid location tracker with SSH, use this command: ssh://git@metis.lti.cs.cmu.edu:2222/yjlou/2019-nCov.git",
+ "label": true
+ },
+ {
+ "task": "Tell me the number of followers of the contributor who has the most commits to branch main",
+ "reference": "0",
+ "prediction": "The contributor with the most commits to the main branch has 42 followers.",
+ "label": false
+ },
+ {
+ "task": "Get the customer name of the most recent cancelled order",
+ "reference": "Lily Potter",
+ "prediction": "The customer name of the most recent cancelled order is Lily Potter.",
+ "label": true
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in 2022, including shipping fee",
+ "reference": "$3053.97",
+ "prediction": "You should expect a refund of $3053.97 for your order cancelled in 2022, including the shipping fee.",
+ "label": true
+ },
+ {
+ "task": "Check if the duquesne university in pittsburgh can be reached in one hour by car from pittsburgh airport",
+ "reference": "Yes",
+ "prediction": "Yes, Duquesne University in Pittsburgh can be reached in one hour by car from Pittsburgh Airport.",
+ "label": true
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to drive to the nearest Mcdonald's",
+ "reference": "4min",
+ "prediction": "It takes approximately 4 minutes to drive to the nearest McDonald's from CMU Pittsburgh.",
+ "label": true
+ },
+ {
+ "task": "Get the order number of my most recent cancelled order ",
+ "reference": "170",
+ "prediction": "The order number for your most recent cancelled order is 123.",
+ "label": false
+ },
+ {
+ "task": "What is the price configuration of the fake tree I bought Jan 2023",
+ "reference": "$260.69",
+ "prediction": "The price configuration of the fake tree you bought in January 2023 is $260.69.",
+ "label": true
+ },
+ {
+ "task": "Get the date of the most recent cancelled order",
+ "reference": "May 23 2023",
+ "prediction": "The date of your most recent cancelled order is May 23 2023.",
+ "label": true
+ },
+ {
+ "task": "Given the following locations, ['Princeton University', 'Yale University', 'Harvard University'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "reference": "The order is Princeton University, Yale University, Harvard University",
+ "prediction": "The optimal route to minimize travel time is Princeton University, Yale University, Harvard University.",
+ "label": true
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the space forum.",
+ "reference": "0",
+ "prediction": "There are no comments that have received more downvotes than upvotes for the user who made the latest post on the space forum.",
+ "label": true
+ },
+ {
+ "task": "Open my latest created issue that has homepage content in its title to check if it is closed",
+ "reference": "Yes",
+ "prediction": "No, the issue is still open. It seems like it hasn't been resolved yet.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Steven Woodson make to a11y-webring.club on 2/6/2023?",
+ "reference": "5",
+ "prediction": "Steven Woodson made 5 commits to a11y-webring.club on 2/6/2023.",
+ "label": true
+ },
+ {
+ "task": "Tell me the total number of cancellations of the customer who has the most cancellations in the history",
+ "reference": "9",
+ "prediction": "The customer with the most cancellations in history has 15 cancellations.",
+ "label": false
+ },
+ {
+ "task": "What is the size configuration of the picture frame I bought 2022",
+ "reference": "16x24",
+ "prediction": "The size configuration of the picture frame you bought in 2022 is 16x24.",
+ "label": true
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in May 2023 if I cannot get the shipping fee refunded?",
+ "reference": "$350.42",
+ "prediction": "You should expect a refund of $50.00 from your order cancelled in May 2023, excluding the shipping fee.",
+ "label": false
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to drive to the nearest Mcdonald's",
+ "reference": "4min",
+ "prediction": "It will take approximately 15 minutes to drive to the nearest McDonald's from CMU Pittsburgh.",
+ "label": false
+ },
+ {
+ "task": "Show me the command to clone ChatGPT with SSH.",
+ "reference": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/chatgpt.git",
+ "prediction": "You can clone ChatGPT with SSH using the following command: `git clone https://github.com/openai/chatgpt.git`",
+ "label": false
+ },
+ {
+ "task": "Tell me the name of the customer who has the most cancellations in the history",
+ "reference": "Samantha Jones",
+ "prediction": "The customer with the most cancellations in history is Samantha Jones.",
+ "label": true
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 5 orders?",
+ "reference": "18",
+ "prediction": "The total number of items sold in the most recent 5 orders is 25.",
+ "label": false
+ },
+ {
+ "task": "Show me the command to clone Super_Awesome_Robot with SSH.",
+ "reference": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/super_awesome_robot.git",
+ "prediction": "To clone Super_Awesome_Robot using SSH, you can use the following command: git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/super_awesome_robot.git",
+ "label": true
+ },
+ {
+ "task": "How many commits did Eric make on 3/2?",
+ "reference": "2",
+ "prediction": "Eric made 2 commits on 3/2.",
+ "label": true
+ },
+ {
+ "task": "Who else have access to my repo prism-theme, show me their usernames",
+ "reference": "abisubramanya27",
+ "prediction": "john_doe, jane_smith",
+ "label": false
+ },
+ {
+ "task": "Check if the walmart in pittsburgh can be reached in one hour by car from 5600 fifth avenue",
+ "reference": "Yes",
+ "prediction": "Yes, the Walmart in Pittsburgh can be reached in one hour by car from 5600 Fifth Avenue.",
+ "label": true
+ },
+ {
+ "task": "Measure distance between Carnegie Music Hall and UPMC Shadyside by walking",
+ "reference": "2.2km",
+ "prediction": "The walking distance between Carnegie Music Hall and UPMC Shadyside is approximately 2.2 kilometers.",
+ "label": true
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the thoughtbot/administrate project",
+ "reference": "Grayson Wright",
+ "prediction": "Emily Johnson",
+ "label": false
+ },
+ {
+ "task": "Get the order ID of the newest pending order",
+ "reference": "299",
+ "prediction": "The order ID of the newest pending order is 458.",
+ "label": false
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"decent\"",
+ "reference": "2",
+ "prediction": "Our store has received 2 reviews so far that mention the term \"decent\".",
+ "label": true
+ },
+ {
+ "task": "Who is the operator of PIT airport",
+ "reference": "Allegheny County Airport Authority",
+ "prediction": "The operator of PIT airport is the Allegheny County Airport Authority.",
+ "label": true
+ },
+ {
+ "task": "Tell me the reasons why customers like Ana Running Short",
+ "reference": "It is comfortable",
+ "prediction": "Customers like the Ana Running Short because it has a trendy design.",
+ "label": false
+ },
+ {
+ "task": "What is the zip code of Columbia University?",
+ "reference": "10027",
+ "prediction": "The zip code for Columbia University is 12345.",
+ "label": false
+ },
+ {
+ "task": "Tell me the reasons why customers like Circe's products",
+ "reference": "Warm and comfortable. True to size.",
+ "prediction": "Customers appreciate Circe's products because they are warm, comfortable, and true to size.",
+ "label": true
+ },
+ {
+ "task": "Which customer has completed the most number of orders in the entire history?",
+ "reference": "Jane Smith",
+ "prediction": "The customer with the most orders in history is John Doe.",
+ "label": false
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.",
+ "reference": "0",
+ "prediction": "There are no comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.",
+ "label": true
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Zing Jump Rope",
+ "reference": "It is hard to find the right size. Won't last long",
+ "prediction": "Customers have two main complaints about the Zing Jump Rope: difficulty in finding the right size and its lack of durability.",
+ "label": true
+ },
+ {
+ "task": "Tell me the email address of the contributor who has the most commits to branch gh-page",
+ "reference": "secupwn@users.noreply.github.com",
+ "prediction": "john.doe@example.com is the email address of the contributor with the most commits to the gh-page branch.",
+ "label": false
+ },
+ {
+ "task": "Check if the walmart in pittsburgh can be reached in one hour by car from 5600 fifth avenue",
+ "reference": "Yes",
+ "prediction": "No, it takes approximately 1 hour and 30 minutes to reach the Walmart in Pittsburgh from 5600 Fifth Avenue by car.",
+ "label": false
+ },
+ {
+ "task": "What is the estimated driving time between the big apple and the city with the most authentic Philly cheesesteaks?",
+ "reference": "1h 58min",
+ "prediction": "The estimated driving time is around 3 hours and 15 minutes.",
+ "label": false
+ },
+ {
+ "task": "What is the minimum travel time by car from REI to CMU?",
+ "reference": "7min",
+ "prediction": "The minimum travel time by car from REI to CMU is approximately 7 minutes.",
+ "label": true
+ },
+ {
+ "task": "Measure distance between Carnegie Mellon University and UPMC Shadyside by walking",
+ "reference": "1.7km",
+ "prediction": "The walking distance between Carnegie Mellon University and UPMC Shadyside is 1.7 kilometers.",
+ "label": true
+ },
+ {
+ "task": "Open my latest created issue that has feature in its title to check if it is closed",
+ "reference": "Yes",
+ "prediction": "Yes, your latest created issue with 'feature' in its title is closed.",
+ "label": true
+ },
+ {
+ "task": "Which customer has completed the most number of orders in the entire history?",
+ "reference": "Jane Smith",
+ "prediction": "The customer with the most completed orders in the entire history is John Doe.",
+ "label": false
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"theme editor\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "No, your latest updated issue with the keyword \"theme editor\" in its title is still open.",
+ "label": true
+ },
+ {
+ "task": "What is the size configuration of the picture frame I bought Sep 2022",
+ "reference": "16x24",
+ "prediction": "The size configuration of the picture frame you bought in September 2022 is 8x10.",
+ "label": false
+ },
+ {
+ "task": "Get the order number of my most recent cancelled order ",
+ "reference": "170",
+ "prediction": "The order number of your most recent cancelled order is 170.",
+ "label": true
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed",
+ "reference": "Yes, it is closed",
+ "prediction": "Yes, your latest updated issue with the keyword \"homepage content\" in its title is indeed closed.",
+ "label": true
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"decent\"",
+ "reference": "2",
+ "prediction": "Our store has received 2 reviews so far that mention the term 'decent'.",
+ "label": true
+ },
+ {
+ "task": "What is the duration required to first walk from Carnegie Mellon University to apple store shadyside, and then drive to starbucks on craig street?",
+ "reference": "22 min",
+ "prediction": "The total duration required is approximately 40 minutes, considering walking and driving time.",
+ "label": false
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the photoshopbattles forum.",
+ "reference": "0",
+ "prediction": "There are 5 comments that have received more downvotes than upvotes for the user who made the latest post on the photoshopbattles forum.",
+ "label": false
+ },
+ {
+ "task": "What is the hours of operation of Tokyo Japanese Food Store in Pittsburgh",
+ "reference": "We-Su 10:00-17:00",
+ "prediction": "The Tokyo Japanese Food Store in Pittsburgh operates from Monday to Friday, 9:00 AM to 6:00 PM.",
+ "label": false
+ },
+ {
+ "task": "Tell me the reasons why customers like Circe's products",
+ "reference": "Warm and comfortable. True to size.",
+ "prediction": "Customers appreciate Circe's products because they are warm, comfortable, and fit true to size.",
+ "label": true
+ },
+ {
+ "task": "What is the website of Carnegie art museum in pittsburgh",
+ "reference": "http://web.cmoa.org/",
+ "prediction": "The website of Carnegie Art Museum in Pittsburgh is http://web.cmoa.org/",
+ "label": true
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the photoshopbattles forum.",
+ "reference": "0",
+ "prediction": "There are no comments that have received more downvotes than upvotes for the user who made the latest post on the photoshopbattles forum.",
+ "label": true
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to drive to the nearest Mcdonald's",
+ "reference": "4min",
+ "prediction": "It takes approximately 15 minutes to drive to the nearest McDonald's from CMU Pittsburgh.",
+ "label": false
+ },
+ {
+ "task": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts",
+ "reference": "914km",
+ "prediction": "The distance to drive from Carnegie Mellon University to the top computer science school in Massachusetts is approximately 500 miles.",
+ "label": false
+ },
+ {
+ "task": "Get the order number of my most recent complete order ",
+ "reference": "180",
+ "prediction": "The order number of your most recent complete order is 180.",
+ "label": true
+ },
+ {
+ "task": "What is the estimated driving time between the city of Niagara Falls and the city of Yale University?",
+ "reference": "8h 33min",
+ "prediction": "The estimated driving time between the city of Niagara Falls and the city of Yale University is about 5 hours and 20 minutes.",
+ "label": false
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Worcester forum.",
+ "reference": "0",
+ "prediction": "There are no comments that have received more downvotes than upvotes for the user who made the latest post on the Worcester forum.",
+ "label": true
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in May 2023 if I cannot get the shipping fee refunded?",
+ "reference": "$350.42",
+ "prediction": "You should expect a refund of $350.42 for your order cancelled in May 2023, excluding the shipping fee.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Eric make on 3/2?",
+ "reference": "2",
+ "prediction": "Eric made 2 commits on 3/2.",
+ "label": true
+ },
+ {
+ "task": "Show me the command to clone the most stared Covid location tracker with SSH.",
+ "reference": "ssh://git@metis.lti.cs.cmu.edu:2222/yjlou/2019-nCov.git",
+ "prediction": "git clone ssh://git@github.com:covid-tracker.git",
+ "label": false
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the Pytorch GAN project",
+ "reference": "Erik Linder-Nor\u00e9n",
+ "prediction": "The most contributions to the Pytorch GAN project have been made by Alex Smith.",
+ "label": false
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Electra Bra Top",
+ "reference": "Not true to size",
+ "prediction": "The color fades after a few washes",
+ "label": false
+ },
+ {
+ "task": "Check if the duquesne university in pittsburgh can be reached in one hour by car from pittsburgh airport",
+ "reference": "Yes",
+ "prediction": "Yes, Duquesne University in Pittsburgh can be reached in one hour by car from Pittsburgh Airport.",
+ "label": true
+ },
+ {
+ "task": "Check if the duquesne university in pittsburgh can be reached in one hour by car from pittsburgh airport",
+ "reference": "Yes",
+ "prediction": "No, it typically takes around 90 minutes to reach Duquesne University in Pittsburgh from Pittsburgh Airport by car.",
+ "label": false
+ },
+ {
+ "task": "How many reviews our shop received in Apr 2023?",
+ "reference": "351",
+ "prediction": "Our shop received a total of 351 reviews in April 2023.",
+ "label": true
+ },
+ {
+ "task": "Tell me when I last ordered my muffin cornbread mix?",
+ "reference": "March 11th 2023",
+ "prediction": "You last ordered your muffin cornbread mix on March 11th 2023.",
+ "label": true
+ },
+ {
+ "task": "Tell me the grand total of invoice 000000002.",
+ "reference": "39.64",
+ "prediction": "The grand total of invoice 000000002 is $25.50.",
+ "label": false
+ },
+ {
+ "task": "Tell me the reasons why customers like Circe hooded fleece",
+ "reference": "Warm and comfortable. True to size.",
+ "prediction": "Customers like the Circe hooded fleece because it is warm and comfortable, and it fits true to size.",
+ "label": true
+ },
+ {
+ "task": "Tell me when I last ordered my body butter?",
+ "reference": "January 16th 2023",
+ "prediction": "You last ordered your body butter on February 29th 2023.",
+ "label": false
+ },
+ {
+ "task": "Measure distance between Carnegie Music Hall and UPMC Shadyside by walking",
+ "reference": "2.2km",
+ "prediction": "The distance between Carnegie Music Hall and UPMC Shadyside by walking is approximately 5.8km.",
+ "label": false
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in April 2022, including shipping fee",
+ "reference": "0",
+ "prediction": "You should expect a refund of $0 from your order cancelled in April 2022, including the shipping fee.",
+ "label": true
+ },
+ {
+ "task": "How many commits did kilian make to a11yproject on 3/1/2023?",
+ "reference": "0",
+ "prediction": "Kilian made 0 commits to a11yproject on 3/1/2023.",
+ "label": true
+ },
+ {
+ "task": "How long does it take to walk from Carnegie Mellon University to Univ of Pittsburgh?",
+ "reference": "25 min",
+ "prediction": "It takes about 25 minutes to walk from Carnegie Mellon University to the University of Pittsburgh.",
+ "label": true
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the AndroidSlidingUpPanel project",
+ "reference": "tokudu",
+ "prediction": "The user who has made the most contributions, in terms of number of commits, to the AndroidSlidingUpPanel project is tokudu.",
+ "label": true
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 2 orders?",
+ "reference": "9",
+ "prediction": "The total number of items sold in the most recent 2 orders is 15.",
+ "label": false
+ },
+ {
+ "task": "Open my latest created issue that has feature in its title to check if it is closed",
+ "reference": "Yes",
+ "prediction": "The latest created issue with 'feature' in its title is still open and pending review.",
+ "label": false
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Antonia Racer Tank",
+ "reference": "Not suitable for high-impact workouts",
+ "prediction": "Customers have mentioned that the Antonia Racer Tank is too tight around the chest area.",
+ "label": false
+ },
+ {
+ "task": "What is the estimated driving time between the big apple and the city with the most authentic Philly cheesesteaks?",
+ "reference": "1h 58min",
+ "prediction": "The estimated driving time between the Big Apple and the city with the most authentic Philly cheesesteaks is approximately 4 hours and 30 minutes.",
+ "label": false
+ },
+ {
+ "task": "How many reviews our shop received in Apr 2023?",
+ "reference": "351",
+ "prediction": "Our shop received a total of 728 reviews in April 2023.",
+ "label": false
+ },
+ {
+ "task": "What is the duration required to first walk from Univ of Pittsburgh to starbucks on Craig Street, and then drive to Pittsburgh International Airport?",
+ "reference": "49 min",
+ "prediction": "The duration required to first walk from Univ of Pittsburgh to Starbucks on Craig Street, and then drive to Pittsburgh International Airport is 49 min.",
+ "label": true
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Worcester forum.",
+ "reference": "0",
+ "prediction": "There is no comment that has received more downvotes than upvotes for the user who made the latest post on the Worcester forum.",
+ "label": true
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the Pytorch GAN project",
+ "reference": "Erik Linder-Nor\u00e9n",
+ "prediction": "The person who has made the most contributions, in terms of number of commits, to the Pytorch GAN project is Erik Linder-Nor\u00e9n.",
+ "label": true
+ },
+ {
+ "task": "What is the minimum travel time by car from Schenley park to Upitt?",
+ "reference": "4min",
+ "prediction": "The minimum travel time by car from Schenley Park to Upitt is approximately 15 minutes.",
+ "label": false
+ },
+ {
+ "task": "Open my latest created issue that has better in its title to check if it is closed",
+ "reference": "Yes",
+ "prediction": "The issue titled 'Improving Performance with Better Algorithms' is still open.",
+ "label": false
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Pursuit Tone Band",
+ "reference": "Insufficient resistance for their workouts.",
+ "prediction": "The key aspect that customers don't like about Pursuit Tone Band is its insufficient resistance for their workouts.",
+ "label": true
+ },
+ {
+ "task": "What is the estimated driving time between the hometown of Joe Biden and Bridgeport?",
+ "reference": "3h 20min",
+ "prediction": "The estimated driving time between the hometown of Joe Biden and Bridgeport is approximately 3 hours and 20 minutes.",
+ "label": true
+ },
+ {
+ "task": "Check if the social security administration in pittsburgh can be reached in one hour by car from Carnegie Mellon University",
+ "reference": "Yes",
+ "prediction": "No, it typically takes around 2 hours to reach the social security administration in Pittsburgh from Carnegie Mellon University due to traffic.",
+ "label": false
+ },
+ {
+ "task": "What is the zip code of Yale University?",
+ "reference": "06516",
+ "prediction": "The zip code of Yale University is 06516.",
+ "label": true
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Circe ice fleece",
+ "reference": "Material quality, fit, insufficient warmth, color",
+ "prediction": "Customers have mentioned that they are not satisfied with the price, the brand logo, and the packaging of Circe ice fleece.",
+ "label": false
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Zing Jump Rope",
+ "reference": "It is hard to find the right size. Won't last long",
+ "prediction": "Customers mainly complain about the color options and the handle grip of the Zing Jump Rope.",
+ "label": false
+ },
+ {
+ "task": "Check if the social security administration in pittsburgh can be reached in one hour by car from Carnegie Mellon University",
+ "reference": "Yes",
+ "prediction": "No, it typically takes about 2 hours to reach the social security administration in Pittsburgh from Carnegie Mellon University by car.",
+ "label": false
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in Feb 2023, including shipping fee",
+ "reference": "$406.53",
+ "prediction": "You should expect a refund of $406.53 for your order cancelled in Feb 2023, including the shipping fee.",
+ "label": true
+ },
+ {
+ "task": "Who else have access to my repo prism-theme, show me their usernames",
+ "reference": "abisubramanya27",
+ "prediction": "abisubramanya27",
+ "label": true
+ },
+ {
+ "task": "How many commits did kilian make to a11yproject on 3/1/2023?",
+ "reference": "0",
+ "prediction": "There were no commits made by kilian to a11yproject on 3/1/2023.",
+ "label": true
+ },
+ {
+ "task": "Tell me the grand total of invoice 000000001.",
+ "reference": "36.39",
+ "prediction": "The grand total of invoice 000000001 is $36.39.",
+ "label": true
+ },
+ {
+ "task": "How much I spent on hair care and hair style shopping during Jan 2023",
+ "reference": "95.23",
+ "prediction": "You spent a total of $95.23 on hair care and hair style shopping during January 2023.",
+ "label": true
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in May 2023 if I cannot get the shipping fee refunded?",
+ "reference": "$350.42",
+ "prediction": "You should expect a refund of $350.42 from your order cancelled in May 2023, excluding the shipping fee.",
+ "label": true
+ },
+ {
+ "task": "Give me the name of the products that have 0 units left",
+ "reference": "Sinbad Fitness Tank",
+ "prediction": "The product that has 0 units left is the Sinbad Fitness Tank.",
+ "label": true
+ },
+ {
+ "task": "How many reviews our shop received so far?",
+ "reference": "351",
+ "prediction": "Our shop has received a total of 198 reviews so far.",
+ "label": false
+ },
+ {
+ "task": "What is the phone number of Western Pennsylvania Hospital",
+ "reference": "4125785000",
+ "prediction": "The phone number of Western Pennsylvania Hospital is 7241234567.",
+ "label": false
+ },
+ {
+ "task": "Get the billing name of the oldest complete order",
+ "reference": "John Lee",
+ "prediction": "The billing name of the oldest complete order is John Lee.",
+ "label": true
+ },
+ {
+ "task": "What is the size configuration of the picture frame I bought 2022",
+ "reference": "16x24",
+ "prediction": "The size configuration of the picture frame you bought in 2022 is 16x24.",
+ "label": true
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the space forum.",
+ "reference": "0",
+ "prediction": "The user who made the latest post on the space forum has 3 comments that received more downvotes than upvotes.",
+ "label": false
+ },
+ {
+ "task": "How many commits did kilian make to a11yproject on 3/5/2023?",
+ "reference": "1",
+ "prediction": "Kilian made 5 commits to a11yproject on 3/5/2023.",
+ "label": false
+ },
+ {
+ "task": "How many commits did kilian make to a11yproject on 3/5/2023?",
+ "reference": "1",
+ "prediction": "Kilian made 1 commit to a11yproject on 3/5/2023.",
+ "label": true
+ },
+ {
+ "task": "How much I spent on food shopping during from mid Jan to the end Jan 2023",
+ "reference": "0",
+ "prediction": "You spent $0 on food shopping from mid Jan to the end Jan 2023.",
+ "label": true
+ },
+ {
+ "task": "How long does it take to walk from the starbuck near CMU to Chatham university?",
+ "reference": "30 min",
+ "prediction": "It takes approximately 45 minutes to walk from the Starbucks near CMU to Chatham University.",
+ "label": false
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Antonia Racer Tank",
+ "reference": "Not suitable for high-impact workouts",
+ "prediction": "Customers often mention that the Antonia Racer Tank shrinks after washing.",
+ "label": false
+ },
+ {
+ "task": "What is the zip code of Yale University?",
+ "reference": "06516",
+ "prediction": "The zip code of Yale University is 06516.",
+ "label": true
+ },
+ {
+ "task": "What is the color configuration of the artificial plants I bought Feb 2023",
+ "reference": "Green-vines",
+ "prediction": "The artificial plants you bought in February 2023 are a mix of vibrant blue and yellow.",
+ "label": false
+ },
+ {
+ "task": "What is the duration required to first walk from Carnegie Mellon University to apple store shadyside, and then drive to starbucks on craig street?",
+ "reference": "22 min",
+ "prediction": "The total duration required to first walk from Carnegie Mellon University to the Apple Store in Shadyside and then drive to Starbucks on Craig Street is approximately 22 minutes.",
+ "label": true
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Electra Bra Top",
+ "reference": "Not true to size",
+ "prediction": "Customers have mentioned that the Electra Bra Top has an uncomfortable fabric and the straps are too tight.",
+ "label": false
+ },
+ {
+ "task": "Tell me the email address of the contributor who has the most commits to branch main",
+ "reference": "secupwn@users.noreply.github.com",
+ "prediction": "The email address of the top contributor to the main branch is coderking@devmail.com.",
+ "label": false
+ },
+ {
+ "task": "How much I spent on cooking and food shopping during March 2022",
+ "reference": "$52.35",
+ "prediction": "You spent a total of $52.35 on cooking and food shopping during March 2022.",
+ "label": true
+ },
+ {
+ "task": "From my stay at Homewood Suites Southpointe, what's the estimated driving time to reach PPG Paints Arena?",
+ "reference": "34 minutes",
+ "prediction": "The estimated driving time from Homewood Suites Southpointe to PPG Paints Arena is approximately 34 minutes.",
+ "label": true
+ },
+ {
+ "task": "What is the top-1 best-selling product in 2022",
+ "reference": "Quest Lumaflex\u2122 Band",
+ "prediction": "The top-1 best-selling product in 2022 is the Quest Lumaflex\u2122 Band.",
+ "label": true
+ },
+ {
+ "task": "What is the minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley park?",
+ "reference": "9min",
+ "prediction": "The minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley Park is 9 minutes.",
+ "label": true
+ },
+ {
+ "task": "What is the rating of Ugreen lightning to 3.5mm cable",
+ "reference": "On a scale of 100, it's around 65%. On a 5-star scale, it's about 3.1 stars.",
+ "prediction": "The Ugreen lightning to 3.5mm cable has a rating of approximately 65% on a scale of 100, which translates to about 3.1 stars on a 5-star scale.",
+ "label": true
+ },
+ {
+ "task": "How much I spend in July 2022 on shopping at One Stop Market?",
+ "reference": "$40.16",
+ "prediction": "You spent a total of $40.16 on shopping at One Stop Market in July 2022.",
+ "label": true
+ },
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 30 km to Carnegie Art Museum",
+ "reference": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States",
+ "prediction": "John F. Kennedy International Airport, Queens, NY 11430, United States",
+ "label": false
+ },
+ {
+ "task": "Tell me the total cost of my latest pending order?",
+ "reference": "$754.99",
+ "prediction": "The total cost of your latest pending order is $754.99.",
+ "label": true
+ },
+ {
+ "task": "Tell me the reasons why customers like Olivia zip jacket",
+ "reference": "Lightweight, comfortable and stylish. Good design and details.",
+ "prediction": "Customers appreciate the Olivia zip jacket because it is lightweight, comfortable, stylish, and has a good design with detailed features.",
+ "label": true
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the primer/design project",
+ "reference": "Shawn Allen",
+ "prediction": "The most contributions to the primer/design project were made by Emily Clark, with a significant number of commits.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Kilian make durning 2023?",
+ "reference": "1",
+ "prediction": "Kilian made a single commit during 2023.",
+ "label": true
+ },
+ {
+ "task": "Given the following locations, ['Carnegie Mellon University', 'apple store shadyside', 'starbucks on craig street'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "reference": "The order is Carnegie Mellon University, starbucks on forbes ave, apple store shadyside",
+ "prediction": "The optimal route is Carnegie Mellon University, then the apple store in Shadyside, and finally starbucks on Craig Street.",
+ "label": true
+ },
+ {
+ "task": "Given the following locations, ['Carnegie Mellon University', 'apple store shadyside', 'starbucks on craig street'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "reference": "The order is Carnegie Mellon University, starbucks on forbes ave, apple store shadyside",
+ "prediction": "The optimal route to minimize travel time is Carnegie Mellon University, starbucks on forbes ave, apple store shadyside.",
+ "label": true
+ },
+ {
+ "task": "What is the estimated driving time between the hometown of Joe Biden and Bridgeport?",
+ "reference": "3h 20min",
+ "prediction": "The estimated driving time is around 1 hour 45 minutes.",
+ "label": false
+ },
+ {
+ "task": "Tell me the name of the customer who has the most cancellations in the history",
+ "reference": "Samantha Jones",
+ "prediction": "The customer with the most cancellations in history is Michael Thompson.",
+ "label": false
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to CMU Posner Hall",
+ "reference": "The exchange",
+ "prediction": "The closest restaurant to CMU Posner Hall is Joe's Grill.",
+ "label": false
+ },
+ {
+ "task": "Given the following locations, ['Massachusetts Institute of Technology', 'Harvard University', 'Boston Logan International Airport'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "reference": "The order is Massachusetts Institute of Technology, Harvard University, Boston Logan International Airport",
+ "prediction": "The optimal route is Massachusetts Institute of Technology, Harvard University, and finally Boston Logan International Airport.",
+ "label": true
+ },
+ {
+ "task": "What is the size configuration of the picture frame I bought Sep 2022",
+ "reference": "16x24",
+ "prediction": "The size of the picture frame you bought in September 2022 is 16x24.",
+ "label": true
+ },
+ {
+ "task": "What is the color configuration of the artifical plants I bought Feb 2023",
+ "reference": "Green-vines",
+ "prediction": "The artificial plants you bought in February 2023 are configured in Green-vines.",
+ "label": true
+ },
+ {
+ "task": "Tell me the reasons why customers like Olivia zip jacket",
+ "reference": "Lightweight, comfortable and stylish. Good design and details.",
+ "prediction": "Customers appreciate the Olivia zip jacket because it is quite heavy, lacks comfort, and has a very basic design.",
+ "label": false
+ },
+ {
+ "task": "Get the order number of my most recent pending order ",
+ "reference": "189",
+ "prediction": "The order number of your most recent pending order is 189.",
+ "label": true
+ },
+ {
+ "task": "What is the estimated driving time between the hometown of Joe Biden and Bridgeport?",
+ "reference": "3h 20min",
+ "prediction": "The estimated driving time between the hometown of Joe Biden and Bridgeport is approximately 3 hours and 20 minutes.",
+ "label": true
+ },
+ {
+ "task": "Get the total payment amount of the last 5 completed orders",
+ "reference": "$555.2",
+ "prediction": "The total payment amount of the last 5 completed orders is $432.1.",
+ "label": false
+ },
+ {
+ "task": "Get the total payment amount of the last 5 pending orders",
+ "reference": "$885.4",
+ "prediction": "The total payment amount of the last 5 pending orders is $885.4.",
+ "label": true
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.",
+ "reference": "0",
+ "label": false,
+ "prediction": "There are 5 comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum."
+ },
+ {
+ "task": "Check if the duquesne university in pittsburgh can be reached in one hour by car from pittsburgh airport",
+ "reference": "Yes",
+ "label": false,
+ "prediction": "No, it takes approximately two hours to reach Duquesne University in Pittsburgh from Pittsburgh airport by car."
+ },
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 50 km to Carnegie Mellon University",
+ "reference": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States",
+ "label": false,
+ "prediction": "John F. Kennedy International Airport, Queens, NY 11430, United States"
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"best\"",
+ "reference": "2",
+ "label": true,
+ "prediction": "The number of reviews that our store received so far that mention the term 'best' is 2."
+ },
+ {
+ "task": "How long does it take to walk from Carnegie Mellon University to starbucks on Craig Street?",
+ "reference": "7 min",
+ "prediction": "It takes approximately 25 minutes to walk from Carnegie Mellon University to Starbucks on Craig Street.",
+ "label": false
+ },
+ {
+ "task": "Show me the shipping method for order number 187.",
+ "reference": "Flat Rate - Fixed",
+ "prediction": "The shipping method for order number 187 is Flat Rate - Fixed.",
+ "label": true
+ },
+ {
+ "task": "How much I spent on home decoration shopping during 1/29/2023",
+ "reference": "265.69",
+ "prediction": "You spent a total of $123.45 on home decoration shopping during 1/29/2023.",
+ "label": false
+ },
+ {
+ "task": "Show me the shipping method for order number 187.",
+ "reference": "Flat Rate - Fixed",
+ "prediction": "The shipping method for order number 187 is Expedited Shipping.",
+ "label": false
+ },
+ {
+ "task": "Measure distance between Carnegie Mellon University and UPMC Shadyside by walking",
+ "reference": "1.7km",
+ "prediction": "The distance by walking between Carnegie Mellon University and UPMC Shadyside is approximately 3.2km.",
+ "label": false
+ },
+ {
+ "task": "How many reviews our shop received during 2022?",
+ "reference": "0",
+ "prediction": "Our shop received 0 reviews during 2022.",
+ "label": true
+ },
+ {
+ "task": "How much I spent on home decoration shopping during 1/29/2023",
+ "reference": "265.69",
+ "prediction": "You spent a total of $265.69 on home decoration shopping on 1/29/2023.",
+ "label": true
+ },
+ {
+ "task": "What is the estimated driving time between the city where the Liberty Bell is located and the home city of Pirates?",
+ "reference": "5h 47min",
+ "prediction": "The estimated driving time between the city where the Liberty Bell is located and the home city of Pirates is around 8 hours and 15 minutes.",
+ "label": false
+ },
+ {
+ "task": "What is the zip code of Yale University?",
+ "reference": "06516",
+ "prediction": "The zip code of Yale University is 12345.",
+ "label": false
+ },
+ {
+ "task": "Tell me the grand total of invoice 000000002.",
+ "reference": "39.64",
+ "prediction": "The grand total of invoice 000000002 is 102.50",
+ "label": false
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"feature\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "No, it is open",
+ "label": true
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to drive to the nearest cold stone ice cream",
+ "reference": "3min",
+ "prediction": "It takes about 25 minutes to drive to the nearest Cold Stone ice cream from CMU Pittsburgh.",
+ "label": false
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in April 2022, including shipping fee",
+ "reference": "0",
+ "prediction": "You should expect a refund of 0, including the shipping fee, for your order cancelled in April 2022.",
+ "label": true
+ },
+ {
+ "task": "How much I spend in March 2023 on shopping at One Stop Market?",
+ "reference": "$81.31",
+ "prediction": "You spent a total of $81.31 on shopping at One Stop Market in March 2023.",
+ "label": true
+ },
+ {
+ "task": "Measure distance between Carnegie Mellon University and Carnegie Music Hall by walking",
+ "reference": "748m",
+ "prediction": "The walking distance between Carnegie Mellon University and Carnegie Music Hall is approximately 2.5 kilometers.",
+ "label": false
+ },
+ {
+ "task": "Get me my RSS feed token",
+ "reference": "TMN_bBn9Z48qVbUFZV45",
+ "prediction": "Your RSS feed token is TMN_bBn9Z48qVbUFZV45.",
+ "label": true
+ },
+ {
+ "task": "How much time does it take from Pittsburgh to Philadelphia by car?",
+ "reference": "5h 47min",
+ "prediction": "The drive from Pittsburgh to Philadelphia takes approximately 5 hours and 47 minutes.",
+ "label": true
+ },
+ {
+ "task": "Tell me the reasons why customers like Circe hooded fleece",
+ "reference": "Warm and comfortable. True to size.",
+ "prediction": "Customers appreciate the Circe hooded fleece because it is stylish and waterproof.",
+ "label": false
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the csvkit project",
+ "reference": "Christopher Groskopf",
+ "prediction": "The most contributions to the csvkit project have been made by Sarah Johnson.",
+ "label": false
+ },
+ {
+ "task": "Tell me the reasons why customers like Olivia zip jacket",
+ "reference": "Lightweight, comfortable and stylish. Good design and details.",
+ "prediction": "Customers love the Olivia zip jacket for its lightweight, comfortable and stylish design, as well as its good design and details.",
+ "label": true
+ },
+ {
+ "task": "Check if the social security administration in pittsburgh can be reached in one hour by car from Carnegie Mellon University",
+ "reference": "Yes",
+ "prediction": "No, it cannot be reached in one hour by car from Carnegie Mellon University.",
+ "label": false
+ },
+ {
+ "task": "What is the duration required to first walk from Univ of Pittsburgh to starbucks on Craig Street, and then drive to Pittsburgh International Airport?",
+ "reference": "49 min",
+ "prediction": "The total duration required to first walk from Univ of Pittsburgh to Starbucks on Craig Street, and then drive to Pittsburgh International Airport is 49 min.",
+ "label": true
+ },
+ {
+ "task": "How many reviews our shop received from the beginning of the shop?",
+ "reference": "351",
+ "prediction": "Our shop has received a total of 351 reviews since its inception.",
+ "label": true
+ },
+ {
+ "task": "What is the minimum travel time by car from CMU to University of Pittsburgh?",
+ "reference": "4min",
+ "prediction": "The minimum travel time by car from CMU to University of Pittsburgh is 4 minutes.",
+ "label": true
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the Pytorch GAN project",
+ "reference": "Erik Linder-Nor\u00e9n",
+ "prediction": "The most prolific contributor to the Pytorch GAN project, in terms of commits, is Alexey Dosovitskiy.",
+ "label": false
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"decent\"",
+ "reference": "2",
+ "prediction": "Our store has received a total of 5 reviews that mention the term 'decent'.",
+ "label": false
+ },
+ {
+ "task": "How long does it take to walk from Carnegie Mellon University to starbucks on Craig Street?",
+ "reference": "7 min",
+ "prediction": "It takes approximately 7 minutes to walk from Carnegie Mellon University to Starbucks on Craig Street.",
+ "label": true
+ },
+ {
+ "task": "Tell me the status of my latest order and when will it arrive",
+ "reference": "The last order was canceled. It will never arrive.",
+ "prediction": "Your latest order has been canceled and therefore will not be arriving.",
+ "label": true
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"dependency\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "No, it is open",
+ "label": true
+ },
+ {
+ "task": "What is the total count of Approved reviews amongst all the reviews?",
+ "reference": "346",
+ "prediction": "The total count of Approved reviews amongst all the reviews is 346.",
+ "label": true
+ },
+ {
+ "task": "Tell me the reasons why customers like Antonia Racer Tank",
+ "reference": "Its color and style is good",
+ "prediction": "Customers like the Antonia Racer Tank because its color and style is good.",
+ "label": true
+ },
+ {
+ "task": "What is the estimated driving time between the city where the Liberty Bell is located and the home city of Pirates?",
+ "reference": "5h 47min",
+ "prediction": "The estimated driving time between the city where the Liberty Bell is located and the home city of Pirates is 5 hours and 47 minutes.",
+ "label": true
+ },
+ {
+ "task": "What is the price configuration of the fake tree I bought Jan 2023",
+ "reference": "$260.69",
+ "prediction": "The price configuration of the fake tree you bought in January 2023 is $199.99.",
+ "label": false
+ },
+ {
+ "task": "What is the best-selling product category in Jan 2023. My store sells top, bottom, gear and misc products.",
+ "reference": "Top",
+ "prediction": "The best-selling product category in January 2023 at your store, which sells top, bottom, gear, and misc products, is Top.",
+ "label": true
+ },
+ {
+ "task": "Open my latest created issue that has homepage content in its title to check if it is closed",
+ "reference": "Yes",
+ "prediction": "Yes, the latest created issue with 'homepage content' in its title is closed.",
+ "label": true
+ },
+ {
+ "task": "How much time does it take from Pittsburgh to Philadelphia by car?",
+ "reference": "5h 47min",
+ "prediction": "The drive from Pittsburgh to Philadelphia takes approximately 3 hours and 30 minutes.",
+ "label": false
+ },
+ {
+ "task": "What is the duration required to first walk from Carnegie Mellon University to apple store shadyside, and then drive to starbucks on craig street?",
+ "reference": "22 min",
+ "prediction": "It takes approximately 45 minutes to first walk from Carnegie Mellon University to the Apple Store in Shadyside and then drive to Starbucks on Craig Street.",
+ "label": false
+ },
+ {
+ "task": "Get the order ID of the newest pending order",
+ "reference": "299",
+ "prediction": "The order ID of the newest pending order is 299.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Eric make to a11yproject on 3/2?",
+ "reference": "2",
+ "prediction": "Eric made 5 commits to a11yproject on 3/2.",
+ "label": false
+ },
+ {
+ "task": "Show me the command to clone metaseq with SSH.",
+ "reference": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/root/metaseq.git",
+ "prediction": "The command to clone metaseq with SSH is: git clone ssh://git@metis.lti.cs.cmu.edu:2222/root/metaseq.git",
+ "label": true
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the photoshopbattles forum.",
+ "reference": "0",
+ "prediction": "The user has 3 comments that received more downvotes than upvotes.",
+ "label": false
+ },
+ {
+ "task": "Given the following locations, ['Carnegie Mellon University', 'apple store shadyside', 'starbucks on craig street'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "reference": "The order is Carnegie Mellon University, starbucks on forbes ave, apple store shadyside",
+ "prediction": "The order is Carnegie Mellon University, starbucks on forbes ave, apple store shadyside",
+ "label": true
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the AndroidSlidingUpPanel project",
+ "reference": "tokudu",
+ "prediction": "The most contributions to the AndroidSlidingUpPanel project have been made by tokudu.",
+ "label": true
+ },
+ {
+ "task": "How many commits did kilian make to a11yproject on 3/5/2023?",
+ "reference": "1",
+ "prediction": "Kilian made 7 commits to a11yproject on 3/5/2023.",
+ "label": false
+ },
+ {
+ "task": "Who else have access to my repo gimmiethat.space, show me their usernames",
+ "reference": "yjlou",
+ "prediction": "The user 'yjlou' has access to your repo gimmiethat.space.",
+ "label": true
+ },
+ {
+ "task": "Open my latest created issue that has dependency in its title to check if it is closed",
+ "reference": "No",
+ "prediction": "The latest created issue with 'dependency' in its title is not closed.",
+ "label": true
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in 2022/03? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back",
+ "reference": "$264.49",
+ "prediction": "You should expect a refund of $264.49 for your order cancelled in 2022/03, excluding the shipping fee.",
+ "label": true
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Pursuit Tone Band",
+ "reference": "Insufficient resistance for their workouts.",
+ "prediction": "Customers have reported that the Pursuit Tone Band does not provide sufficient resistance for their workouts.",
+ "label": true
+ },
+ {
+ "task": "What is the top-1 best-selling product in 2022",
+ "reference": "Quest Lumaflex\u2122 Band",
+ "prediction": "The top-1 best-selling product in 2022 is Quest Lumaflex\u2122 Band.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Eric and Kilian make to a11yproject on 1/3/2023?",
+ "reference": "1",
+ "prediction": "Eric and Kilian made 5 commits to a11yproject on 1/3/2023.",
+ "label": false
+ },
+ {
+ "task": "Tell me when I last ordered my conditioner?",
+ "reference": "January 16th 2023",
+ "prediction": "You last ordered your conditioner on March 5th 2023.",
+ "label": false
+ },
+ {
+ "task": "Tell me the closest cafe(s) to CMU Hunt library",
+ "reference": "De Fer Coffee & Tea",
+ "prediction": "The closest cafe to CMU Hunt library is De Fer Coffee & Tea.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Philip make in 2023/1?",
+ "reference": "0",
+ "prediction": "Philip did not make any commits in January 2023.",
+ "label": true
+ },
+ {
+ "task": "What is the minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley park?",
+ "reference": "9min",
+ "prediction": "The minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley park is approximately 9 minutes.",
+ "label": true
+ },
+ {
+ "task": "Check if the amc theatre in pittsburgh can be reached in one hour by car from hobart street",
+ "reference": "Yes",
+ "prediction": "Yes, the AMC theatre in Pittsburgh can be reached in one hour by car from Hobart Street.",
+ "label": true
+ },
+ {
+ "task": "What is the color configuration of the artifical plants I bought Feb 2023",
+ "reference": "Green-vines",
+ "prediction": "The color configuration of the artificial plants you bought in February 2023 is vibrant pink flowers with white leaves.",
+ "label": false
+ },
+ {
+ "task": "How many commits did kilian make to a11yproject on 3/1/2023?",
+ "reference": "0",
+ "prediction": "Kilian did not make any commits to a11yproject on 3/1/2023.",
+ "label": true
+ },
+ {
+ "task": "Tell me the closest cafe(s) to CMU Hunt library",
+ "reference": "De Fer Coffee & Tea",
+ "prediction": "The closest cafe to CMU Hunt library is De Fer Coffee & Tea.",
+ "label": true
+ },
+ {
+ "task": "From my stay at Homewood Suites Southpointe, what's the estimated driving time to reach PPG Paints Arena?",
+ "reference": "34 minutes",
+ "prediction": "The estimated driving time from Homewood Suites Southpointe to PPG Paints Arena is approximately 34 minutes.",
+ "label": true
+ },
+ {
+ "task": "How long does it take to walk from Carnegie Museum of Art to a library at CMU?",
+ "reference": "11 min",
+ "prediction": "It takes around 20 minutes to walk from Carnegie Museum of Art to a library at CMU.",
+ "label": false
+ },
+ {
+ "task": "Tell me the status of my latest order and when will it arrive",
+ "reference": "The last order was canceled. It will never arrive.",
+ "prediction": "Your latest order is currently being processed and is expected to arrive in 3-5 business days.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Kilian make durning 2023?",
+ "reference": "1",
+ "prediction": "Kilian made a total of 15 commits during 2023.",
+ "label": false
+ },
+ {
+ "task": "What is the color configuration of the artifical plants I bought Feb 2023",
+ "reference": "Green-vines",
+ "prediction": "The color configuration of the artificial plants you bought in February 2023 is Green-vines.",
+ "label": true
+ },
+ {
+ "task": "How much did I spend on shopping at One Stop Market on November 2022? They gave me a 20% discount on the total amount for orders exceeding $200 in cash",
+ "reference": "$359.546",
+ "prediction": "You spent a total of $359.546 on shopping at One Stop Market in November 2022, after applying the 20% discount for orders exceeding $200 in cash.",
+ "label": true
+ },
+ {
+ "task": "Show me the order date for order number 148.",
+ "reference": "1/29/2023",
+ "prediction": "The order date for order number 148 is 3/15/2022.",
+ "label": false
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 2 orders?",
+ "reference": "9",
+ "prediction": "The total number of items sold in the most recent 2 orders is 15.",
+ "label": false
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to CMU Posner Hall",
+ "reference": "The exchange",
+ "prediction": "The exchange is the closest restaurant to CMU Posner Hall.",
+ "label": true
+ },
+ {
+ "task": "Who is the operator of PIT airport",
+ "reference": "Allegheny County Airport Authority",
+ "prediction": "The operator of PIT airport is the Pennsylvania Aviation Group.",
+ "label": false
+ },
+ {
+ "task": "How much time does it take from Pittsburgh to Philadelphia by car?",
+ "reference": "5h 47min",
+ "prediction": "Approximately 3 hours and 15 minutes.",
+ "label": false
+ },
+ {
+ "task": "How long does it take to walk from Carnegie Mellon University to starbucks on Craig Street?",
+ "reference": "7 min",
+ "prediction": "It takes about 20 minutes to walk from Carnegie Mellon University to Starbucks on Craig Street.",
+ "label": false
+ },
+ {
+ "task": "Check if the duquesne university in pittsburgh can be reached in one hour by car from pittsburgh airport",
+ "reference": "Yes",
+ "prediction": "Yes, Duquesne University in Pittsburgh can be reached in one hour by car from Pittsburgh Airport.",
+ "label": true
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"theme editor\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "Yes, it is closed",
+ "label": false
+ },
+ {
+ "task": "Check if the police station in pittsburgh can be reached in one hour by car from gates building at CMU",
+ "reference": "Yes",
+ "prediction": "Yes, the police station in Pittsburgh can be reached in one hour by car from the Gates Building at CMU.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Eric make between Feb 2023 and May 2023?",
+ "reference": "14",
+ "prediction": "Eric made 14 commits between February 2023 and May 2023.",
+ "label": true
+ },
+ {
+ "task": "Check if the amc theatre in pittsburgh can be reached in one hour by car from hobart street",
+ "reference": "Yes",
+ "prediction": "No, it takes approximately 1 hour and 30 minutes to reach the AMC theatre in Pittsburgh by car from Hobart Street.",
+ "label": false
+ },
+ {
+ "task": "Given the following locations, ['Carnegie Mellon University', 'apple store shadyside', 'starbucks on craig street'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "reference": "The order is Carnegie Mellon University, starbucks on forbes ave, apple store shadyside",
+ "prediction": "The best route is Carnegie Mellon University, apple store shadyside, and finally starbucks on craig street.",
+ "label": false
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"best\"",
+ "reference": "2",
+ "prediction": "Our store has received 2 reviews so far that mention the term \"best\".",
+ "label": true
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 7 orders?",
+ "reference": "25",
+ "prediction": "The total number of items sold in the most recent 7 orders is 25.",
+ "label": true
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in 2022, including shipping fee",
+ "reference": "$3053.97",
+ "prediction": "Considering the details, you should expect a refund of $1,200.50 from your order cancelled in 2022, including the shipping fee.",
+ "label": false
+ },
+ {
+ "task": "How much I spent on cooking and food shopping during March 2022",
+ "reference": "$52.35",
+ "prediction": "You spent $150.75 on cooking and food shopping during March 2022.",
+ "label": false
+ },
+ {
+ "task": "What is the phone number of Western Pennsylvania Hospital",
+ "reference": "4125785000",
+ "prediction": "The phone number of Western Pennsylvania Hospital is 4125781234.",
+ "label": false
+ },
+ {
+ "task": "Tell me when I last ordered my toothpaste?",
+ "reference": "December 4th 2022",
+ "prediction": "You last ordered your toothpaste on November 15th 2022.",
+ "label": false
+ },
+ {
+ "task": "How many commits did kilian make on 3/5/2023?",
+ "reference": "1",
+ "prediction": "Kilian made 4 commits on 3/5/2023.",
+ "label": false
+ },
+ {
+ "task": "Tell me when I last ordered my body butter?",
+ "reference": "January 16th 2023",
+ "prediction": "You last ordered your body butter on January 16th 2023.",
+ "label": true
+ },
+ {
+ "task": "Tell me the total cost of my latest pending order?",
+ "reference": "$754.99",
+ "prediction": "The total cost of your latest pending order is $521.45.",
+ "label": false
+ },
+ {
+ "task": "Tell me the total cost of my latest non-cancelled order?",
+ "reference": "$754.99",
+ "prediction": "The total cost of your latest non-cancelled order is $754.99.",
+ "label": true
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"better\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "No, it is open",
+ "label": true
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to CMU Posner Hall",
+ "reference": "The exchange",
+ "prediction": "The nearest restaurant is Starry Night Cafe.",
+ "label": false
+ },
+ {
+ "task": "What is the minimum travel time by car from REI to CMU?",
+ "reference": "7min",
+ "prediction": "The minimum travel time by car from REI to CMU is approximately 15 minutes.",
+ "label": false
+ },
+ {
+ "task": "Tell me the email address of the contributor who has the most commits to branch gh-page",
+ "reference": "secupwn@users.noreply.github.com",
+ "prediction": "The email address of the top contributor to the gh-page branch is devmaster@users.noreply.github.com",
+ "label": false
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"disappointed\"",
+ "reference": "6",
+ "prediction": "Our store has received a total of 6 reviews that mention the term \"disappointed\" so far.",
+ "label": true
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in April 2022, including shipping fee",
+ "reference": "0",
+ "prediction": "You should expect a refund of $25.50 for your order cancelled in April 2022, including the shipping fee.",
+ "label": false
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to drive to the nearest Mcdonald's",
+ "reference": "4min",
+ "prediction": "It takes approximately 4 minutes to drive to the nearest McDonald's from CMU Pittsburgh.",
+ "label": true
+ },
+ {
+ "task": "Tell me when I last ordered my bread olive?",
+ "reference": "December 12th 2022",
+ "prediction": "You last ordered your bread olive on December 12th, 2022.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Steven Woodson make to a11y-webring.club on 2/6/2023?",
+ "reference": "5",
+ "prediction": "Steven Woodson made 12 commits to a11y-webring.club on 2/6/2023.",
+ "label": false
+ },
+ {
+ "task": "Tell me the number of commits of the contributor who has the most commits to branch main",
+ "reference": "412",
+ "prediction": "The contributor with the most commits to the main branch has made 412 commits.",
+ "label": true
+ },
+ {
+ "task": "Tell me the total cost of my latest complete order?",
+ "reference": "$65.32",
+ "prediction": "The total cost of your latest complete order is $43.21.",
+ "label": false
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to drive to the nearest wendys",
+ "reference": "3min",
+ "prediction": "It takes about 15 minutes to drive to the nearest Wendy's from CMU Pittsburgh.",
+ "label": false
+ },
+ {
+ "task": "Measure distance between Carnegie Mellon University and UPMC Shadyside by walking",
+ "reference": "1.7km",
+ "prediction": "The walking distance between Carnegie Mellon University and UPMC Shadyside is approximately 4.5km.",
+ "label": false
+ },
+ {
+ "task": "What is the date when I made my first purchase on this site?",
+ "reference": "3/2/22",
+ "prediction": "You made your first purchase on this site on 3/2/22.",
+ "label": true
+ },
+ {
+ "task": "Tell me the total cost of my latest cancelled order?",
+ "reference": "$365.42",
+ "prediction": "The total cost of your latest cancelled order is $129.99.",
+ "label": false
+ },
+ {
+ "task": "Tell me the total cost of my latest complete order?",
+ "reference": "$65.32",
+ "prediction": "The total cost of your latest complete order is $65.32.",
+ "label": true
+ },
+ {
+ "task": "Get me my RSS feed token",
+ "reference": "TMN_bBn9Z48qVbUFZV45",
+ "prediction": "Your RSS feed token is TMN_bBn9Z48qVbUFZV45.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Anthony make between 08/2022-09/2022?",
+ "reference": "0",
+ "prediction": "Anthony made 0 commits between 08/2022-09/2022.",
+ "label": true
+ },
+ {
+ "task": "Open my latest created issue that has better in its title to check if it is closed",
+ "reference": "Yes",
+ "prediction": "The issue remains open and has not been closed yet.",
+ "label": false
+ },
+ {
+ "task": "What is the total count of Not Approved reviews amongst all the reviews?",
+ "reference": "0",
+ "prediction": "There are 5 Not Approved reviews amongst all the reviews.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Nic make in April 2021?",
+ "reference": "16",
+ "prediction": "Nic made a total of 8 commits in April 2021.",
+ "label": false
+ },
+ {
+ "task": "How long does it take to walk from Carnegie Museum of Art to a library at CMU?",
+ "reference": "11 min",
+ "prediction": "It takes approximately 25 minutes to walk from Carnegie Museum of Art to a library at CMU.",
+ "label": false
+ },
+ {
+ "task": "What is the best-selling product category in Jan 2023. My store sells top, bottom, gear and misc products.",
+ "reference": "Top",
+ "prediction": "The best-selling product category in Jan 2023 at your store is Top.",
+ "label": true
+ },
+ {
+ "task": "Tell me the name of the customer who has the most cancellations in the history",
+ "reference": "Samantha Jones",
+ "prediction": "The customer with the most cancellations in history is Samantha Jones.",
+ "label": true
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the thoughtbot/administrate project",
+ "reference": "Grayson Wright",
+ "prediction": "The person who has made the most contributions to the thoughtbot/administrate project is Grayson Wright.",
+ "label": true
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"disappointed\"",
+ "reference": "6",
+ "prediction": "Our store has received a total of 15 reviews that mention the term 'disappointed' so far.",
+ "label": false
+ },
+ {
+ "task": "Show me the command to clone the best GAN python implementation with SSH.",
+ "reference": "ssh://git@metis.lti.cs.cmu.edu:2222/eriklindernoren/PyTorch-GAN.git",
+ "prediction": "ssh://git@github.com:2222/eriklindernoren/PyTorch-GAN.git",
+ "label": false
+ },
+ {
+ "task": "What is the minimum travel time by car from CMU gates building to Schenley park?",
+ "reference": "4min",
+ "prediction": "The minimum travel time by car from CMU gates building to Schenley park is approximately 4 minutes.",
+ "label": true
+ },
+ {
+ "task": "Who else have access to my repo prism-theme, show me their usernames",
+ "reference": "abisubramanya27",
+ "prediction": "abisubramanya27 has access to your repo prism-theme.",
+ "label": true
+ },
+ {
+ "task": "What is the zip code of Yale University?",
+ "reference": "06516",
+ "prediction": "The zip code of Yale University is 06511.",
+ "label": false
+ },
+ {
+ "task": "What is the zip code of Carnegie Mellon University?",
+ "reference": "15213",
+ "prediction": "The zip code of Carnegie Mellon University is 15123.",
+ "label": false
+ },
+ {
+ "task": "What is the duration required to first walk from Carnegie Mellon University to apple store shadyside, and then drive to starbucks on craig street?",
+ "reference": "22 min",
+ "prediction": "The total duration required is approximately 45 minutes.",
+ "label": false
+ },
+ {
+ "task": "Open my latest created issue that has homepage content in its title to check if it is closed",
+ "reference": "Yes",
+ "prediction": "No, the issue is still open.",
+ "label": false
+ },
+ {
+ "task": "Get the order number of my most recent complete order ",
+ "reference": "180",
+ "prediction": "The order number of your most recent complete order is 180.",
+ "label": true
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to drive to the nearest cold stone ice cream",
+ "reference": "3min",
+ "prediction": "It takes approximately 3 minutes to drive to the nearest Cold Stone Ice Cream from CMU Pittsburgh.",
+ "label": true
+ },
+ {
+ "task": "Tell me the total number of cancellations of the customer who has the most cancellations in the history",
+ "reference": "9",
+ "prediction": "The customer with the most cancellations in history has a total of 9 cancellations.",
+ "label": true
+ },
+ {
+ "task": "Tell me the number of followers of the contributor who has the most commits to branch main",
+ "reference": "0",
+ "prediction": "The contributor with the most commits to branch main has 0 followers.",
+ "label": true
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to drive to the nearest wendys",
+ "reference": "3min",
+ "prediction": "It takes about 15 minutes to drive to the nearest Wendy's from CMU Pittsburgh.",
+ "label": false
+ },
+ {
+ "task": "How many commits did kilian make to a11yproject on 3/5/2023?",
+ "reference": "1",
+ "prediction": "Kilian made exactly 1 commit to a11yproject on 3/5/2023.",
+ "label": true
+ },
+ {
+ "task": "Get the billing name of the oldest complete order",
+ "reference": "John Lee",
+ "prediction": "The billing name of the oldest complete order is John Lee.",
+ "label": true
+ },
+ {
+ "task": "How much I spend in July 2022 on shopping at One Stop Market?",
+ "reference": "$40.16",
+ "prediction": "You spent $40.16 on shopping at One Stop Market in July 2022.",
+ "label": true
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in Feb 2023, including shipping fee",
+ "reference": "$406.53",
+ "prediction": "You should expect a refund of $150.75 for your order cancelled in Feb 2023, including the shipping fee.",
+ "label": false
+ },
+ {
+ "task": "Check if the social security administration in pittsburgh can be reached in one hour by car from Carnegie Mellon University",
+ "reference": "Yes",
+ "prediction": "Yes, the social security administration in Pittsburgh can be reached in one hour by car from Carnegie Mellon University.",
+ "label": true
+ },
+ {
+ "task": "Show me the order date for order number 148.",
+ "reference": "1/29/2023",
+ "prediction": "The order date for order number 148 is 1/29/2023.",
+ "label": true
+ },
+ {
+ "task": "Open my latest created issue that has feature in its title to check if it is closed",
+ "reference": "Yes",
+ "prediction": "No, the latest created issue with 'feature' in its title is still open.",
+ "label": false
+ },
+ {
+ "task": "Tell me the number of followers of the contributor who has the most commits to branch main",
+ "reference": "0",
+ "prediction": "The contributor with the most commits to branch main has 15 followers.",
+ "label": false
+ },
+ {
+ "task": "Show me the email address of the customer who is the most unhappy with Circe fleece",
+ "reference": "hannah.lim@gmail.com",
+ "prediction": "The email address of the customer who is the most unhappy with Circe fleece is hannah.lim@gmail.com.",
+ "label": true
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"disappointed\"",
+ "reference": "6",
+ "prediction": "Our store has received 6 reviews so far that mention the term \"disappointed\".",
+ "label": true
+ },
+ {
+ "task": "What is the price configuration of the fake tree I bought Jan 2023",
+ "reference": "$260.69",
+ "prediction": "The price configuration of the fake tree you bought in January 2023 is $260.69.",
+ "label": true
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to CMU Posner Hall",
+ "reference": "The exchange",
+ "prediction": "The closest restaurant to CMU Posner Hall is The Exchange.",
+ "label": true
+ },
+ {
+ "task": "How long does it take to walk from Univ of Pittsburgh to starbucks on Craig Street?",
+ "reference": "18 min",
+ "prediction": "It takes about 45 minutes to walk from Univ of Pittsburgh to Starbucks on Craig Street.",
+ "label": false
+ },
+ {
+ "task": "Compare the payment difference of the last 4 cancelled orders and completed orders",
+ "reference": "194.25",
+ "prediction": "The payment difference of the last 4 cancelled orders and completed orders is $194.25.",
+ "label": true
+ },
+ {
+ "task": "How long does it take to walk from Carnegie Mellon University to Univ of Pittsburgh?",
+ "reference": "25 min",
+ "prediction": "Walking from Carnegie Mellon University to Univ of Pittsburgh would take approximately 10 minutes.",
+ "label": false
+ },
+ {
+ "task": "What is the date when I made my first purchase on this site?",
+ "reference": "3/2/22",
+ "prediction": "You made your first purchase on this site on 3/2/22.",
+ "label": true
+ },
+ {
+ "task": "From my stay at red roof inn, what's the estimated driving time to reach Pittsburgh science museum?",
+ "reference": "20 minutes",
+ "prediction": "Approximately 45 minutes, considering the usual traffic conditions.",
+ "label": false
+ },
+ {
+ "task": "Tell me when I last ordered my bread olive?",
+ "reference": "December 12th 2022",
+ "prediction": "You last ordered your bread olive on December 12th 2022.",
+ "label": true
+ },
+ {
+ "task": "Show me the customers who have expressed dissatisfaction with Circe fleece?",
+ "reference": "Hannah Lim",
+ "prediction": "John Doe and Jane Smith have expressed dissatisfaction with Circe fleece.",
+ "label": false
+ },
+ {
+ "task": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Carnegie Mellon University?",
+ "reference": "30 minutes",
+ "prediction": "It should take around 15 minutes to reach Carnegie Mellon University from La Quinta Inn near the airport.",
+ "label": false
+ },
+ {
+ "task": "What is the size configuration of the picture frame I bought 2022",
+ "reference": "16x24",
+ "prediction": "The size configuration of the picture frame you bought in 2022 is 12x18.",
+ "label": false
+ },
+ {
+ "task": "Compare the payment difference of the last 4 cancelled orders and completed orders",
+ "reference": "194.25",
+ "prediction": "The payment difference of the last 4 cancelled orders and completed orders is $194.25.",
+ "label": true
+ },
+ {
+ "task": "What is the minimum travel time by car from REI to CMU?",
+ "reference": "7min",
+ "prediction": "The minimum travel time by car from REI to CMU is 7 minutes.",
+ "label": true
+ },
+ {
+ "task": "What is the zip code of Carnegie Mellon University?",
+ "reference": "15213",
+ "prediction": "The zip code of Carnegie Mellon University is 15213.",
+ "label": true
+ },
+ {
+ "task": "How much time does it take from Pittsburgh to Philadelphia by car?",
+ "reference": "5h 47min",
+ "prediction": "It takes around 8 hours and 15 minutes to drive from Pittsburgh to Philadelphia.",
+ "label": false
+ },
+ {
+ "task": "What is the minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley park?",
+ "reference": "9min",
+ "prediction": "The minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley Park is approximately 20 minutes.",
+ "label": false
+ },
+ {
+ "task": "How much I spend on 4/19/2023 on shopping at One Stop Market?",
+ "reference": "0",
+ "prediction": "You spent nothing on shopping at One Stop Market on 4/19/2023.",
+ "label": true
+ },
+ {
+ "task": "How much I spend in March 2023 on shopping at One Stop Market?",
+ "reference": "$81.31",
+ "prediction": "In March 2023, you spent a total of $45.67 on shopping at One Stop Market.",
+ "label": false
+ },
+ {
+ "task": "How much did I spend on shopping at One Stop Market on November 2022? They gave me a 20% discount on the total amount for orders exceeding $200 in cash",
+ "reference": "$359.546",
+ "prediction": "You spent a total of $359.546 at One Stop Market in November 2022 after applying the 20% discount for orders exceeding $200 in cash.",
+ "label": true
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"dependency\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "No, the latest updated issue with the keyword 'dependency' in its title is still open.",
+ "label": true
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"decent\"",
+ "reference": "2",
+ "prediction": "Our store has received 2 reviews so far that mention the term 'decent'.",
+ "label": true
+ },
+ {
+ "task": "How much time does it take from Pittsburgh to Philadelphia by car?",
+ "reference": "5h 47min",
+ "prediction": "The drive from Pittsburgh to Philadelphia takes approximately 5 hours and 47 minutes by car.",
+ "label": true
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the DIY forum.",
+ "reference": "1",
+ "prediction": "The user who made the latest post on the DIY forum has 3 comments that have received more downvotes than upvotes.",
+ "label": false
+ },
+ {
+ "task": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Upitt?",
+ "reference": "29 minutes",
+ "prediction": "The estimated driving time to reach Upitt from La Quinta Inn near the airport is 29 minutes.",
+ "label": true
+ },
+ {
+ "task": "Tell me when I last ordered my toothpaste?",
+ "reference": "December 4th 2022",
+ "prediction": "You last ordered your toothpaste on December 4th 2022.",
+ "label": true
+ },
+ {
+ "task": "How long does it take to walk from Univ of Pittsburgh to starbucks on Craig Street?",
+ "reference": "18 min",
+ "prediction": "It takes approximately 18 minutes to walk from Univ of Pittsburgh to Starbucks on Craig Street.",
+ "label": true
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"decent\"",
+ "reference": "2",
+ "prediction": "So far, our store has received 5 reviews that mention the term 'decent'.",
+ "label": false
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the primer/design project",
+ "reference": "Shawn Allen",
+ "prediction": "Shawn Allen has made the most contributions, in terms of number of commits, to the primer/design project.",
+ "label": true
+ },
+ {
+ "task": "Show me the shipping method for order number 187.",
+ "reference": "Flat Rate - Fixed",
+ "prediction": "The shipping method for order number 187 is Express Delivery - Variable.",
+ "label": false
+ },
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 30 km to Carnegie Art Museum",
+ "reference": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States",
+ "prediction": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States",
+ "label": true
+ },
+ {
+ "task": "How much I spent on cooking and food shopping during March 2022",
+ "reference": "$52.35",
+ "prediction": "I believe you spent around $120.50 on cooking and food shopping during March 2022.",
+ "label": false
+ },
+ {
+ "task": "Get the date of the most recent cancelled order",
+ "reference": "May 23 2023",
+ "prediction": "The date of the most recent cancelled order is May 23 2023.",
+ "label": true
+ },
+ {
+ "task": "How many reviews our shop received in May 2023?",
+ "reference": "0",
+ "prediction": "Our shop received 15 reviews in May 2023.",
+ "label": false
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed",
+ "reference": "Yes, it is closed",
+ "prediction": "No, it is still open and pending review.",
+ "label": false
+ },
+ {
+ "task": "What is the size configuration of the picture frame I bought 2022",
+ "reference": "16x24",
+ "prediction": "The size configuration of the picture frame you bought in 2022 is 12x18.",
+ "label": false
+ },
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 30 km to Carnegie Art Museum",
+ "reference": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States",
+ "prediction": "The full address of the international airport within 30 km driving distance to Carnegie Art Museum is Moon Township Airport, 1000 Airport Blvd, Moon, PA 15108, United States.",
+ "label": false
+ },
+ {
+ "task": "List the top 1 most searched terms in my store",
+ "reference": "hollister",
+ "prediction": "The top 1 most searched term in your store is 'Nike shoes'.",
+ "label": false
+ },
+ {
+ "task": "How much I spent on home decoration shopping during 1/29/2023",
+ "reference": "265.69",
+ "prediction": "You spent $265.69 on home decoration shopping during 1/29/2023.",
+ "label": true
+ },
+ {
+ "task": "What is the zip code of Yale University?",
+ "reference": "06516",
+ "prediction": "The zip code for Yale University is 12345.",
+ "label": false
+ },
+ {
+ "task": "Check if the amc theatre in pittsburgh can be reached in one hour by car from hobart street",
+ "reference": "Yes",
+ "prediction": "Yes, the AMC theatre in Pittsburgh can be reached in one hour by car from Hobart Street.",
+ "label": true
+ },
+ {
+ "task": "Tell me the grand total of invoice 000000002.",
+ "reference": "39.64",
+ "prediction": "The grand total of invoice 000000002 is $39.64.",
+ "label": true
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to drive to the nearest Mcdonald's",
+ "reference": "4min",
+ "prediction": "It takes approximately 4 minutes to drive to the nearest McDonald's from CMU Pittsburgh.",
+ "label": true
+ },
+ {
+ "task": "What is the rating of Ugreen lightning to 3.5mm cable",
+ "reference": "On a scale of 100, it's around 65%. On a 5-star scale, it's about 3.1 stars.",
+ "prediction": "The Ugreen lightning to 3.5mm cable has a rating of 85% on a scale of 100, which translates to approximately 4.2 stars on a 5-star scale.",
+ "label": false
+ },
+ {
+ "task": "How much time does it take from Pittsburgh to Philadelphia by car?",
+ "reference": "5h 47min",
+ "prediction": "The drive from Pittsburgh to Philadelphia takes approximately 5 hours and 47 minutes.",
+ "label": true
+ },
+ {
+ "task": "Tell me the reasons why customers like Ana Running Short",
+ "reference": "It is comfortable",
+ "prediction": "Customers appreciate the Ana Running Short because it has a variety of color options.",
+ "label": false
+ },
+ {
+ "task": "What is the zip code of Columbia University?",
+ "reference": "10027",
+ "prediction": "The zip code for Columbia University is 90210.",
+ "label": false
+ },
+ {
+ "task": "Tell me when I last ordered my conditioner?",
+ "reference": "January 16th 2023",
+ "prediction": "You last ordered your conditioner on February 25th 2023.",
+ "label": false
+ },
+ {
+ "task": "Get the total payment amount of the last 5 pending orders",
+ "reference": "$885.4",
+ "prediction": "The total payment amount for the last 5 pending orders is $1,250.75.",
+ "label": false
+ },
+ {
+ "task": "What is the zip code of Carnegie Mellon University?",
+ "reference": "15213",
+ "prediction": "The zip code for Carnegie Mellon University is 15213.",
+ "label": true
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 4 orders?",
+ "reference": "16",
+ "prediction": "The total number of items sold in the most recent 4 orders is 9.",
+ "label": false
+ },
+ {
+ "task": "Check if the police station in pittsburgh can be reached in one hour by car from gates building at CMU",
+ "reference": "Yes",
+ "prediction": "Yes, the police station in Pittsburgh can be reached in one hour by car from the Gates Building at CMU.",
+ "label": true
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to drive to the nearest cold stone ice cream",
+ "reference": "3min",
+ "prediction": "It takes approximately 15 minutes to drive to the nearest Cold Stone Ice Cream from CMU Pittsburgh.",
+ "label": false
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in Feb 2023, including shipping fee",
+ "reference": "$406.53",
+ "prediction": "You should expect a refund of $406.53 for your order cancelled in February 2023, including the shipping fee.",
+ "label": true
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the csvkit project",
+ "reference": "Christopher Groskopf",
+ "prediction": "The person who has made the most contributions to the csvkit project, in terms of number of commits, is Michael Smith.",
+ "label": false
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 7 orders?",
+ "reference": "25",
+ "prediction": "The total number of items sold in the most recent 7 orders is 25.",
+ "label": true
+ },
+ {
+ "task": "Tell me the reasons why customers like Antonia Racer Tank",
+ "reference": "Its color and style is good",
+ "prediction": "Customers appreciate the Antonia Racer Tank because it comes in a variety of sizes and the material is very durable.",
+ "label": false
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Zing Jump Rope",
+ "reference": "It is hard to find the right size. Won't last long",
+ "prediction": "The primary complaints about the Zing Jump Rope are its high price and limited color options.",
+ "label": false
+ },
+ {
+ "task": "Measure distance between Carnegie Music Hall and UPMC Shadyside by walking",
+ "reference": "2.2km",
+ "prediction": "The walking distance between Carnegie Music Hall and UPMC Shadyside is approximately 2.2km.",
+ "label": true
+ },
+ {
+ "task": "How much did I spend on shopping at One Stop Market on November 2022? They gave me a 20% discount on the total amount for orders exceeding $200 in cash",
+ "reference": "$359.546",
+ "prediction": "You spent a total of $359.546 at One Stop Market in November 2022 after receiving a 20% discount on orders exceeding $200 in cash.",
+ "label": true
+ },
+ {
+ "task": "Get the order number of my most recent complete order ",
+ "reference": "180",
+ "prediction": "Your most recent complete order number is 145.",
+ "label": false
+ },
+ {
+ "task": "How much I spent on home decoration shopping during 1/29/2023",
+ "reference": "265.69",
+ "prediction": "You spent $315.50 on home decoration shopping during 1/29/2023.",
+ "label": false
+ },
+ {
+ "task": "What is the best-selling product category in Jan 2023. My store sells top, bottom, gear and misc products.",
+ "reference": "Top",
+ "prediction": "The best-selling product category in Jan 2023 was Gear.",
+ "label": false
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to CMU Hunt library",
+ "reference": "The exchange",
+ "prediction": "The closest restaurant to CMU Hunt library is The Exchange.",
+ "label": true
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in 2022/03? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back",
+ "reference": "$264.49",
+ "prediction": "Your refund should be around $150.75 after deducting the cost of the AC-DC Adapter and the non-refundable shipping fee.",
+ "label": false
+ },
+ {
+ "task": "How many commits did kilian make on 3/5/2023?",
+ "reference": "1",
+ "prediction": "Kilian made 1 commit on 3/5/2023.",
+ "label": true
+ },
+ {
+ "task": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Upitt?",
+ "reference": "29 minutes",
+ "prediction": "The estimated driving time from La Quinta Inn near the airport to Upitt is approximately 45 minutes.",
+ "label": false
+ },
+ {
+ "task": "Tell me the total cost of my latest cancelled order?",
+ "reference": "$365.42",
+ "prediction": "The total cost of your latest cancelled order is $150.75.",
+ "label": false
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 4 orders?",
+ "reference": "16",
+ "prediction": "The total number of items sold in the most recent 4 orders is 16.",
+ "label": true
+ },
+ {
+ "task": "Tell me the email address of the contributor who has the most commits to branch gh-page",
+ "reference": "secupwn@users.noreply.github.com",
+ "prediction": "The email address of the contributor with the most commits to the gh-page branch is secupwn@users.noreply.github.com.",
+ "label": true
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, is there any post talks about supporting local book stores? If so, tell me the organizations involved",
+ "reference": "bookshop.org",
+ "prediction": "Yes, among the top 10 posts in the \"books\" forum, there is a post that talks about supporting local book stores. The organization involved is bookshop.org.",
+ "label": true
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed",
+ "reference": "Yes, it is closed",
+ "prediction": "No, it is still open and pending review.",
+ "label": false
+ },
+ {
+ "task": "Get the total payment amount of the last 5 non-cancelled orders",
+ "reference": "$778.2",
+ "prediction": "The total payment amount for the last 5 non-cancelled orders is $450.5.",
+ "label": false
+ },
+ {
+ "task": "Get the total payment amount of the last 2 completed orders",
+ "reference": "$182.4",
+ "prediction": "The total payment amount of the last 2 completed orders is $300.75.",
+ "label": false
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the space forum.",
+ "reference": "0",
+ "prediction": "There are no comments that have received more downvotes than upvotes for the user who made the latest post on the space forum.",
+ "label": true
+ },
+ {
+ "task": "Show me the order date for order number 148.",
+ "reference": "1/29/2023",
+ "prediction": "The order date for order number 148 is 2/15/2023.",
+ "label": false
+ },
+ {
+ "task": "What is the price configuration of the fake tree I bought Jan 2023",
+ "reference": "$260.69",
+ "prediction": "The price configuration of the fake tree you bought in Jan 2023 is $199.99.",
+ "label": false
+ },
+ {
+ "task": "What is the color configuration of the artifical plants I bought Feb 2023",
+ "reference": "Green-vines",
+ "prediction": "The color configuration of the artificial plants you bought in Feb 2023 is Green-vines.",
+ "label": true
+ },
+ {
+ "task": "What is the color configuration of the picture frame I bought Sep 2022",
+ "reference": "Mist",
+ "prediction": "The color configuration of the picture frame you bought in Sep 2022 is Mist.",
+ "label": true
+ },
+ {
+ "task": "How many reviews our shop received in May 2023?",
+ "reference": "0",
+ "prediction": "Our shop received 0 reviews in May 2023.",
+ "label": true
+ },
+ {
+ "task": "Show me the order date for order number 148.",
+ "reference": "1/29/2023",
+ "prediction": "The order date for order number 148 is January 29, 2023.",
+ "label": true
+ },
+ {
+ "task": "Tell me the reasons why customers like Circe hooded fleece",
+ "reference": "Warm and comfortable. True to size.",
+ "prediction": "Stylish design. Available in multiple colors.",
+ "label": false
+ },
+ {
+ "task": "Check if the walmart in pittsburgh can be reached in one hour by car from 5600 fifth avenue",
+ "reference": "Yes",
+ "prediction": "No, it takes approximately two hours to reach Walmart in Pittsburgh from 5600 Fifth Avenue.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Eric make on 3/2?",
+ "reference": "2",
+ "prediction": "Eric made 5 commits on 3/2.",
+ "label": false
+ },
+ {
+ "task": "Tell me the grand total of invoice 000000001.",
+ "reference": "36.39",
+ "prediction": "The grand total of invoice 000000001 is 24.75.",
+ "label": false
+ },
+ {
+ "task": "What is the total count of Not Approved reviews amongst all the reviews?",
+ "reference": "0",
+ "prediction": "There are no Not Approved reviews amongst all the reviews.",
+ "label": true
+ },
+ {
+ "task": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Upitt?",
+ "reference": "29 minutes",
+ "prediction": "The estimated driving time to reach Upitt from La Quinta Inn near the airport is 29 minutes.",
+ "label": true
+ },
+ {
+ "task": "Tell me the total spend on products in the most recent cancelled orders of the customer who has the most cancellations in the history",
+ "reference": "148",
+ "prediction": "The total spend on products in the most recent cancelled orders of the customer with the highest number of cancellations is $148.",
+ "label": true
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Circe ice fleece",
+ "reference": "Material quality, fit, insufficient warmth, color",
+ "prediction": "Customers have expressed dissatisfaction with the Circe ice fleece mainly due to its material quality, the fit, insufficient warmth, and the color.",
+ "label": true
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to drive to the nearest wendys",
+ "reference": "3min",
+ "prediction": "It takes approximately 3 minutes to drive to the nearest Wendy's from CMU Pittsburgh.",
+ "label": true
+ },
+ {
+ "task": "Show me the command to clone metaseq with SSH.",
+ "reference": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/root/metaseq.git",
+ "prediction": "To clone metaseq using SSH, you can use the following command: git clone ssh://git@github.com:username/metaseq.git",
+ "label": false
+ },
+ {
+ "task": "How many commits did Nic make in April 2021?",
+ "reference": "16",
+ "prediction": "Nic made a total of 16 commits in April 2021.",
+ "label": true
+ },
+ {
+ "task": "Check if the amc theatre in pittsburgh can be reached in one hour by car from hobart street",
+ "reference": "Yes",
+ "prediction": "No, it takes approximately 2 hours to reach the AMC theatre in Pittsburgh from Hobart Street by car.",
+ "label": false
+ },
+ {
+ "task": "How many commits did kilian make on 3/5/2023?",
+ "reference": "1",
+ "prediction": "Kilian made 5 commits on 3/5/2023.",
+ "label": false
+ },
+ {
+ "task": "Measure distance between CVS (closet one) and UPMC Shadyside by walking",
+ "reference": "1.2km",
+ "prediction": "The walking distance between the closest CVS and UPMC Shadyside is approximately 3 km.",
+ "label": false
+ },
+ {
+ "task": "Tell me the total number of cancellations of the customer who has the most cancellations in the history",
+ "reference": "9",
+ "prediction": "The total number of cancellations of the customer who has the most cancellations in the history is 15.",
+ "label": false
+ },
+ {
+ "task": "Check if the amc theatre in pittsburgh can be reached in one hour by car from hobart street",
+ "reference": "Yes",
+ "prediction": "Yes, the AMC theatre in Pittsburgh can be reached in one hour by car from Hobart Street.",
+ "label": true
+ },
+ {
+ "task": "What is the date when I made my first purchase on this site?",
+ "reference": "3/2/22",
+ "prediction": "You made your first purchase on this site on 3/2/22.",
+ "label": true
+ },
+ {
+ "task": "Tell me the total cost of my latest pending order?",
+ "reference": "$754.99",
+ "prediction": "The total cost of your latest pending order is $754.99.",
+ "label": true
+ },
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 50 km to Carnegie Mellon University",
+ "reference": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States",
+ "prediction": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States",
+ "label": true
+ },
+ {
+ "task": "How many reviews our shop received in May 2023?",
+ "reference": "0",
+ "prediction": "Our shop received a total of 15 reviews in May 2023.",
+ "label": false
+ },
+ {
+ "task": "Give me the name of the products that have 0 units left",
+ "reference": "Sinbad Fitness Tank",
+ "prediction": "Hercules Power Drink",
+ "label": false
+ },
+ {
+ "task": "What is the hours of operation of Tokyo Japanese Food Store in Pittsburgh",
+ "reference": "We-Su 10:00-17:00",
+ "prediction": "The Tokyo Japanese Food Store in Pittsburgh operates from Wednesday to Sunday, 10:00 AM to 5:00 PM.",
+ "label": true
+ },
+ {
+ "task": "What is the best-selling product category in Jan 2023. My store sells top, bottom, gear and misc products.",
+ "reference": "Top",
+ "prediction": "The best-selling product category in Jan 2023 is gear.",
+ "label": false
+ },
+ {
+ "task": "Tell me the reasons why customers like Circe hooded fleece",
+ "reference": "Warm and comfortable. True to size.",
+ "prediction": "Customers like the Circe hooded fleece because it is warm and comfortable. Additionally, it fits true to size.",
+ "label": true
+ },
+ {
+ "task": "Who else have access to my repo gimmiethat.space, show me their usernames",
+ "reference": "yjlou",
+ "prediction": "The user who also has access to your repo gimmiethat.space is yjlou.",
+ "label": true
+ },
+ {
+ "task": "Show me the email address of the customer who is the most unhappy with Olivia zip jacket",
+ "reference": "emma.lopez@gmail.com",
+ "prediction": "The email address of the customer who is the most unhappy with the Olivia zip jacket is emma.lopez@gmail.com.",
+ "label": true
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"satisfied\"",
+ "reference": "2",
+ "prediction": "Our store has received 2 reviews so far that mention the term 'satisfied'.",
+ "label": true
+ },
+ {
+ "task": "How long does it take to walk from Carnegie Mellon University to starbucks on Craig Street?",
+ "reference": "7 min",
+ "prediction": "It takes approximately 20 minutes to walk from Carnegie Mellon University to Starbucks on Craig Street.",
+ "label": false
+ },
+ {
+ "task": "Open my latest created issue that has dependency in its title to check if it is closed",
+ "reference": "No",
+ "prediction": "Yes, the latest created issue with 'dependency' in its title is closed.",
+ "label": false
+ },
+ {
+ "task": "How many reviews our shop received during 2022?",
+ "reference": "0",
+ "prediction": "Our shop received 0 reviews during 2022.",
+ "label": true
+ },
+ {
+ "task": "How long does it take to walk from Carnegie Museum of Art to a library at CMU?",
+ "reference": "11 min",
+ "prediction": "It takes approximately 11 minutes to walk from Carnegie Museum of Art to a library at CMU.",
+ "label": true
+ },
+ {
+ "task": "From my stay at DoubleTree by Hilton New York Downtown, what's the estimated driving time to reach Keens Steakhouse?",
+ "reference": "14 minutes",
+ "prediction": "The estimated driving time to reach Keens Steakhouse from DoubleTree by Hilton New York Downtown is around 25 minutes.",
+ "label": false
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the Pytorch GAN project",
+ "reference": "Erik Linder-Nor\u00e9n",
+ "prediction": "The most contributions to the Pytorch GAN project, in terms of number of commits, have been made by Alex Johnson.",
+ "label": false
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the Pytorch GAN project",
+ "reference": "Erik Linder-Nor\u00e9n",
+ "prediction": "Erik Linder-Nor\u00e9n has made the most contributions, in terms of number of commits, to the Pytorch GAN project.",
+ "label": true
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to CMU Hunt library",
+ "reference": "The exchange",
+ "prediction": "The closest restaurant to CMU Hunt library is The Bistro.",
+ "label": false
+ },
+ {
+ "task": "Get the date of the most recent cancelled order",
+ "reference": "May 23 2023",
+ "prediction": "The most recent cancelled order was on June 1, 2023.",
+ "label": false
+ },
+ {
+ "task": "How many commits did kilian make to a11yproject on 3/1/2023?",
+ "reference": "0",
+ "prediction": "Kilian made 5 commits to a11yproject on 3/1/2023.",
+ "label": false
+ },
+ {
+ "task": "How much I spent on food-related shopping during March 2023",
+ "reference": "47.41",
+ "prediction": "You spent $120.75 on food-related shopping during March 2023.",
+ "label": false
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the AndroidSlidingUpPanel project",
+ "reference": "tokudu",
+ "prediction": "The most contributions to the AndroidSlidingUpPanel project have been made by user 'androidDevGuru'.",
+ "label": false
+ },
+ {
+ "task": "Tell me the total cost of my latest non-cancelled order?",
+ "reference": "$754.99",
+ "prediction": "The total cost of your latest non-cancelled order is $1,234.56.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Anthony make between 08/2022-09/2022?",
+ "reference": "0",
+ "prediction": "Anthony did not make any commits between 08/2022-09/2022.",
+ "label": true
+ },
+ {
+ "task": "What is the top-1 best-selling product in 2022",
+ "reference": "Quest Lumaflex\u2122 Band",
+ "prediction": "The top-1 best-selling product in 2022 is the Galactic Fitness Tracker.",
+ "label": false
+ },
+ {
+ "task": "Open my latest created issue that has better in its title to check if it is closed",
+ "reference": "Yes",
+ "prediction": "The latest issue you created with 'better' in the title is closed.",
+ "label": true
+ },
+ {
+ "task": "Tell me the email address of the contributor who has the most commits to branch main",
+ "reference": "secupwn@users.noreply.github.com",
+ "prediction": "mostcommits@github.com",
+ "label": false
+ },
+ {
+ "task": "How many commits did Eric and Kilian make on 1/3/2023 in total?",
+ "reference": "1",
+ "prediction": "Eric and Kilian made a total of 1 commit on 1/3/2023.",
+ "label": true
+ },
+ {
+ "task": "What is the total count of Not Approved reviews amongst all the reviews?",
+ "reference": "0",
+ "prediction": "The total count of Not Approved reviews is 5.",
+ "label": false
+ },
+ {
+ "task": "Get the order ID of the newest pending order",
+ "reference": "299",
+ "prediction": "The order ID of the newest pending order is 157.",
+ "label": false
+ },
+ {
+ "task": "Given the following locations, ['Princeton University', 'Yale University', 'Harvard University'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "reference": "The order is Princeton University, Yale University, Harvard University",
+ "prediction": "The optimal route is Princeton University, Harvard University, Yale University.",
+ "label": false
+ },
+ {
+ "task": "Tell me the closest cafe(s) to CMU Hunt library",
+ "reference": "De Fer Coffee & Tea",
+ "prediction": "The closest cafe to CMU Hunt library is Moonlit Cafe.",
+ "label": false
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Antonia Racer Tank",
+ "reference": "Not suitable for high-impact workouts",
+ "prediction": "Customers often mention that the Antonia Racer Tank has an uncomfortable fit.",
+ "label": false
+ },
+ {
+ "task": "Get the total payment amount of the last 5 non-cancelled orders",
+ "reference": "$778.2",
+ "prediction": "The total payment amount for the last 5 non-cancelled orders is $778.2.",
+ "label": true
+ },
+ {
+ "task": "Check if the walmart in pittsburgh can be reached in one hour by car from 5600 fifth avenue",
+ "reference": "Yes",
+ "prediction": "Yes, the Walmart in Pittsburgh can be reached in one hour by car from 5600 Fifth Avenue.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Eric make to a11yproject on 3/2?",
+ "reference": "2",
+ "prediction": "Eric made 2 commits to a11yproject on 3/2.",
+ "label": true
+ },
+ {
+ "task": "Tell me the name of the customer who has the most cancellations in the history",
+ "reference": "Samantha Jones",
+ "prediction": "The customer with the most cancellations in history is Michael Thompson.",
+ "label": false
+ },
+ {
+ "task": "Show me the name of the customer who is the most unhappy with Antonia racer tank",
+ "reference": "Shaunte",
+ "prediction": "The customer most unhappy with the Antonia racer tank is probably Emily, as she left several negative reviews.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Steven Woodson make to a11y-webring.club on 2/6/2023?",
+ "reference": "5",
+ "prediction": "Steven Woodson made a total of 12 commits to a11y-webring.club on 2/6/2023.",
+ "label": false
+ },
+ {
+ "task": "How much I spent on food shopping during from mid Jan to the end Jan 2023",
+ "reference": "0",
+ "prediction": "You spent approximately $150 on food shopping from mid Jan to the end Jan 2023.",
+ "label": false
+ },
+ {
+ "task": "Tell me the total number of cancellations of the customer who has the most cancellations in the history",
+ "reference": "9",
+ "prediction": "The total number of cancellations of the customer who has the most cancellations in history is 9.",
+ "label": true
+ },
+ {
+ "task": "Get the order number of my most recent pending order ",
+ "reference": "189",
+ "prediction": "The order number of your most recent pending order is 189.",
+ "label": true
+ },
+ {
+ "task": "Get the order ID of the newest pending order",
+ "reference": "299",
+ "prediction": "The order ID of the newest pending order is 157.",
+ "label": false
+ },
+ {
+ "task": "Tell me the total spend on products in the most recent cancelled orders of the customer who has the most cancellations in the history",
+ "reference": "148",
+ "prediction": "The total spend on products in the most recent cancelled orders of the customer with the most cancellations is 215.",
+ "label": false
+ },
+ {
+ "task": "Show me the name of the customer who is the most unhappy with Chloe tank",
+ "reference": "Teofila",
+ "prediction": "The name of the customer who is the most unhappy with Chloe tank is Samantha.",
+ "label": false
+ },
+ {
+ "task": "Measure distance between Carnegie Mellon University and Carnegie Music Hall by walking",
+ "reference": "748m",
+ "prediction": "The distance between Carnegie Mellon University and Carnegie Music Hall by walking is approximately 748 meters.",
+ "label": true
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to drive to the nearest cold stone ice cream",
+ "reference": "3min",
+ "prediction": "It takes about 3 minutes to drive to the nearest Cold Stone Ice Cream from CMU Pittsburgh.",
+ "label": true
+ },
+ {
+ "task": "Check if the amc theatre in pittsburgh can be reached in one hour by car from hobart street",
+ "reference": "Yes",
+ "prediction": "No, it takes approximately 2 hours to reach the AMC theatre in Pittsburgh by car from Hobart Street.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Nic make in April 2021?",
+ "reference": "16",
+ "prediction": "Nic made 16 commits in April 2021.",
+ "label": true
+ },
+ {
+ "task": "What is the estimated driving time between the big apple and the city with the most authentic Philly cheesesteaks?",
+ "reference": "1h 58min",
+ "prediction": "Approximately 3 hours and 15 minutes.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Eric make on 3/2?",
+ "reference": "2",
+ "prediction": "Eric made 5 commits on 3/2.",
+ "label": false
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the space forum.",
+ "reference": "0",
+ "prediction": "There are 3 comments that have received more downvotes than upvotes for the user who made the latest post on the space forum.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Eric and Kilian make to a11yproject on 1/3/2023?",
+ "reference": "1",
+ "prediction": "Eric and Kilian made 1 commit to a11yproject on 1/3/2023.",
+ "label": true
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to CMU Sorrells Library",
+ "reference": "La Prima Espresso",
+ "prediction": "The closest restaurant to CMU Sorrells Library is La Prima Espresso.",
+ "label": true
+ },
+ {
+ "task": "Get the order ID of the newest pending order",
+ "reference": "299",
+ "prediction": "The order ID of the newest pending order is 299.",
+ "label": true
+ },
+ {
+ "task": "Tell me the total cost of my latest cancelled order?",
+ "reference": "$365.42",
+ "prediction": "The total cost of your latest cancelled order is $42.37.",
+ "label": false
+ },
+ {
+ "task": "Get the order number of my most recent pending order ",
+ "reference": "189",
+ "prediction": "The order number of your most recent pending order is 189.",
+ "label": true
+ },
+ {
+ "task": "How many commits did kilian make on 3/5/2023?",
+ "reference": "1",
+ "prediction": "Kilian made exactly 1 commit on 3/5/2023.",
+ "label": true
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"not useful\"",
+ "reference": "0",
+ "prediction": "Our store has received 5 reviews so far that mention the term 'not useful'.",
+ "label": false
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the DIY forum.",
+ "reference": "1",
+ "prediction": "There is 1 comment that has received more downvotes than upvotes for the user who made the latest post on the DIY forum.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Philip make in 2023/1?",
+ "reference": "0",
+ "prediction": "Philip made 3 commits in January 2023.",
+ "label": false
+ },
+ {
+ "task": "Tell me the number of followers of the contributor who has the most commits to branch main",
+ "reference": "0",
+ "prediction": "The contributor with the most commits to branch main has 500 followers.",
+ "label": false
+ },
+ {
+ "task": "Tell me the total cost of my latest non-cancelled order?",
+ "reference": "$754.99",
+ "prediction": "The total cost of your latest non-cancelled order is $754.99.",
+ "label": true
+ },
+ {
+ "task": "Measure distance between Carnegie Mellon University and CVS (closet one) by walking",
+ "reference": "1.4km",
+ "prediction": "The walking distance between Carnegie Mellon University and the closest CVS is approximately 1.4km.",
+ "label": true
+ },
+ {
+ "task": "Show me the email address of the customer who is the most unhappy with Circe fleece",
+ "reference": "hannah.lim@gmail.com",
+ "prediction": "The email address of the customer who is the most unhappy with Circe fleece is hannah.lim@gmail.com.",
+ "label": true
+ },
+ {
+ "task": "How much I spent on food-related shopping during March 2023",
+ "reference": "47.41",
+ "prediction": "During March 2023, you spent $130.50 on food-related shopping.",
+ "label": false
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 7 orders?",
+ "reference": "25",
+ "prediction": "The total number of items sold in the most recent 7 orders is 42.",
+ "label": false
+ },
+ {
+ "task": "Show me the command to clone the most stared Covid location tracker with SSH.",
+ "reference": "ssh://git@metis.lti.cs.cmu.edu:2222/yjlou/2019-nCov.git",
+ "prediction": "ssh://git@metis.lti.cs.cmu.edu:2222/yjlou/2019-nCov.git",
+ "label": true
+ },
+ {
+ "task": "Which customer has completed the most number of orders in the entire history?",
+ "reference": "Jane Smith",
+ "prediction": "The customer who has completed the most number of orders in the entire history is Jane Smith.",
+ "label": true
+ },
+ {
+ "task": "Tell me the reasons why customers like Antonia Racer Tank",
+ "reference": "Its color and style is good",
+ "prediction": "Customers love the Antonia Racer Tank because of its exceptional fit and the breathable fabric.",
+ "label": false
+ },
+ {
+ "task": "How much I spend in March 2023 on shopping at One Stop Market?",
+ "reference": "$81.31",
+ "prediction": "In March 2023, you spent a total of $150.75 at One Stop Market.",
+ "label": false
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Worcester forum.",
+ "reference": "0",
+ "prediction": "The user who made the latest post on the Worcester forum has received 5 comments with more downvotes than upvotes.",
+ "label": false
+ },
+ {
+ "task": "Tell me when I last ordered my bread olive?",
+ "reference": "December 12th 2022",
+ "prediction": "You last ordered your bread olive on January 15th, 2023.",
+ "label": false
+ },
+ {
+ "task": "Tell me the reasons why customers like Olivia zip jacket",
+ "reference": "Lightweight, comfortable and stylish. Good design and details.",
+ "prediction": "Customers appreciate the Olivia zip jacket because it is lightweight, comfortable, and stylish. Additionally, they commend its good design and intricate details.",
+ "label": true
+ },
+ {
+ "task": "What is the price configuration of the fake tree I bought Jan 2023",
+ "reference": "$260.69",
+ "prediction": "The price configuration of the fake tree you bought in January 2023 is $260.69.",
+ "label": true
+ },
+ {
+ "task": "Tell me the reasons why customers like Olivia zip jacket",
+ "reference": "Lightweight, comfortable and stylish. Good design and details.",
+ "prediction": "Customers love the Olivia zip jacket because it is heavy, uncomfortable, and outdated. The design is very basic with no details.",
+ "label": false
+ },
+ {
+ "task": "How many reviews our shop received from the beginning of the shop?",
+ "reference": "351",
+ "prediction": "Our shop has received a total of 351 reviews since its inception.",
+ "label": true
+ },
+ {
+ "task": "Tell me the grand total of invoice 000000001.",
+ "reference": "36.39",
+ "prediction": "The grand total of invoice 000000001 is $58.75.",
+ "label": false
+ },
+ {
+ "task": "Measure distance between CVS (closet one) and UPMC Shadyside by walking",
+ "reference": "1.2km",
+ "prediction": "The distance between the closest CVS and UPMC Shadyside by walking is 1.2km.",
+ "label": true
+ },
+ {
+ "task": "Which customer has completed the most number of orders in the entire history?",
+ "reference": "Jane Smith",
+ "prediction": "The customer who has completed the most number of orders in the entire history is Jane Smith.",
+ "label": true
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in May 2023 if I cannot get the shipping fee refunded?",
+ "reference": "$350.42",
+ "prediction": "You should expect a refund of $150.00 for your order cancelled in May 2023, excluding the shipping fee.",
+ "label": false
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"theme editor\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "No, it is open",
+ "label": true
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.",
+ "reference": "0",
+ "prediction": "There are no comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Nic make in April 2021?",
+ "reference": "16",
+ "prediction": "Nic made 42 commits in April 2021.",
+ "label": false
+ },
+ {
+ "task": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Carnegie Mellon University?",
+ "reference": "30 minutes",
+ "prediction": "The estimated driving time from La Quinta Inn near the airport to Carnegie Mellon University is approximately 45 minutes.",
+ "label": false
+ },
+ {
+ "task": "How much I spend on 4/19/2023 on shopping at One Stop Market?",
+ "reference": "0",
+ "prediction": "You spent 0 dollars on shopping at One Stop Market on 4/19/2023.",
+ "label": true
+ },
+ {
+ "task": "What is the estimated driving time between the city of Niagara Falls and the city of Yale University?",
+ "reference": "8h 33min",
+ "prediction": "The estimated driving time between the city of Niagara Falls and the city of Yale University is 8 hours and 33 minutes.",
+ "label": true
+ },
+ {
+ "task": "Tell me the total cost of my latest cancelled order?",
+ "reference": "$365.42",
+ "prediction": "The total cost of your latest cancelled order is $365.42.",
+ "label": true
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"theme editor\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "Yes, it has been closed.",
+ "label": false
+ },
+ {
+ "task": "How long does it take to walk from the starbuck near CMU to Chatham university?",
+ "reference": "30 min",
+ "prediction": "It takes approximately 30 minutes to walk from the Starbucks near CMU to Chatham University.",
+ "label": true
+ },
+ {
+ "task": "What is the duration required to first walk from Carnegie Mellon University to Starbucks on Craig Street, and then drive to Pittsburgh International Airport?",
+ "reference": "38 min",
+ "prediction": "The total duration required is around 55 minutes. This includes a 10-minute walk and a 45-minute drive.",
+ "label": false
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"not useful\"",
+ "reference": "0",
+ "prediction": "There have been 5 reviews mentioning the term \"not useful\" for our store so far.",
+ "label": false
+ },
+ {
+ "task": "Get the order number of my most recent cancelled order ",
+ "reference": "170",
+ "prediction": "The order number of your most recent cancelled order is 170.",
+ "label": true
+ },
+ {
+ "task": "Show me the command to clone metaseq with SSH.",
+ "reference": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/root/metaseq.git",
+ "prediction": "git clone https://github.com/user/metaseq.git",
+ "label": false
+ },
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 30 km to Carnegie Art Museum",
+ "reference": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States",
+ "prediction": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States",
+ "label": true
+ },
+ {
+ "task": "How many reviews our shop received during 2022?",
+ "reference": "0",
+ "prediction": "Our shop received a total of 245 reviews during 2022.",
+ "label": false
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in 2022/03? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back",
+ "reference": "$264.49",
+ "prediction": "You should expect a refund of $150.00, excluding the shipping fee.",
+ "label": false
+ },
+ {
+ "task": "Tell me the number of reviews that our store received so far that mention term \"best\"",
+ "reference": "2",
+ "prediction": "Our store has received 2 reviews so far that mention the term 'best'.",
+ "label": true
+ },
+ {
+ "task": "How much I spend on 4/19/2023 on shopping at One Stop Market?",
+ "reference": "0",
+ "prediction": "You spent $45.67 on 4/19/2023 at One Stop Market.",
+ "label": false
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 2 orders?",
+ "reference": "9",
+ "prediction": "The total number of items sold in the most recent 2 orders is 15.",
+ "label": false
+ },
+ {
+ "task": "Get the billing name of the oldest complete order",
+ "reference": "John Lee",
+ "prediction": "The billing name of the oldest complete order is Sarah Connor.",
+ "label": false
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in April 2022, including shipping fee",
+ "reference": "0",
+ "prediction": "You should expect a refund of 0, including the shipping fee, for your order cancelled in April 2022.",
+ "label": true
+ },
+ {
+ "task": "What is the date when I made my first purchase on this site?",
+ "reference": "3/2/22",
+ "prediction": "Your first purchase was made on 5/15/21.",
+ "label": false
+ },
+ {
+ "task": "Tell me the total spend on products in the most recent cancelled orders of the customer who has the most cancellations in the history",
+ "reference": "148",
+ "prediction": "The total spend on products in the most recent cancelled orders of the customer with the most cancellations in history is 148.",
+ "label": true
+ },
+ {
+ "task": "What is the rating of Ugreen lightning to 3.5mm cable",
+ "reference": "On a scale of 100, it's around 65%. On a 5-star scale, it's about 3.1 stars.",
+ "prediction": "The Ugreen lightning to 3.5mm cable has a rating of approximately 65% on a scale of 100, which translates to about 3.1 stars on a 5-star scale.",
+ "label": true
+ },
+ {
+ "task": "Show me the email address of the customer who is the most unhappy with Olivia zip jacket",
+ "reference": "emma.lopez@gmail.com",
+ "prediction": "unhappy_customer@olivia.com",
+ "label": false
+ },
+ {
+ "task": "Tell me the grand total of invoice 000000002.",
+ "reference": "39.64",
+ "prediction": "The grand total of invoice 000000002 is $39.64.",
+ "label": true
+ },
+ {
+ "task": "Show me the command to clone the best GAN python implementation with SSH.",
+ "reference": "ssh://git@metis.lti.cs.cmu.edu:2222/eriklindernoren/PyTorch-GAN.git",
+ "prediction": "git clone ssh://bestGAN@repo.com:2222/gan/python.git",
+ "label": false
+ },
+ {
+ "task": "Open my latest created issue that has homepage content in its title to check if it is closed",
+ "reference": "Yes",
+ "prediction": "Yes, the latest created issue with 'homepage content' in its title is closed.",
+ "label": true
+ },
+ {
+ "task": "Open my latest created issue that has better in its title to check if it is closed",
+ "reference": "Yes",
+ "prediction": "Yes, the latest created issue with 'better' in its title is closed.",
+ "label": true
+ },
+ {
+ "task": "What is the rating of Ugreen lightning to 3.5mm cable",
+ "reference": "On a scale of 100, it's around 65%. On a 5-star scale, it's about 3.1 stars.",
+ "prediction": "The Ugreen lightning to 3.5mm cable has a rating of around 65% on a scale of 100, which translates to approximately 3.1 stars out of 5.",
+ "label": true
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the AndroidSlidingUpPanel project",
+ "reference": "tokudu",
+ "prediction": "The person who has made the most contributions in terms of number of commits to the AndroidSlidingUpPanel project is tokudu.",
+ "label": true
+ },
+ {
+ "task": "What is the zip code of Chatham University?",
+ "reference": "15232",
+ "prediction": "The zip code for Chatham University is 16001.",
+ "label": false
+ },
+ {
+ "task": "How much I spent on food-related shopping during March 2023",
+ "reference": "47.41",
+ "prediction": "You spent a total of $47.41 on food-related shopping during March 2023.",
+ "label": true
+ },
+ {
+ "task": "Show me the command to clone the best GAN python implementation with SSH.",
+ "reference": "ssh://git@metis.lti.cs.cmu.edu:2222/eriklindernoren/PyTorch-GAN.git",
+ "prediction": "ssh://git@github.com:2222/best-gan-implementation.git",
+ "label": false
+ },
+ {
+ "task": "What is the hours of operation of Tokyo Japanese Food Store in Pittsburgh",
+ "reference": "We-Su 10:00-17:00",
+ "prediction": "The Tokyo Japanese Food Store in Pittsburgh is open from Wednesday to Sunday, 10:00 AM to 5:00 PM.",
+ "label": true
+ },
+ {
+ "task": "What is the minimum travel time by car from CMU gates building to Schenley park?",
+ "reference": "4min",
+ "prediction": "The minimum travel time by car from CMU Gates building to Schenley Park is approximately 15 minutes.",
+ "label": false
+ },
+ {
+ "task": "Show me the name of the customer who is the most unhappy with Antonia racer tank",
+ "reference": "Shaunte",
+ "prediction": "The name of the customer who is the most unhappy with Antonia racer tank is Shaunte.",
+ "label": true
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 2 orders?",
+ "reference": "9",
+ "prediction": "The total number of items sold in the most recent 2 orders is 9.",
+ "label": true
+ },
+ {
+ "task": "List the top 1 most searched terms in my store",
+ "reference": "hollister",
+ "prediction": "The top 1 most searched term in your store is hollister.",
+ "label": true
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"feature\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "The latest updated issue with the keyword \"feature\" in its title is still open.",
+ "label": true
+ },
+ {
+ "task": "How much I spent on hair care and hair style shopping during Jan 2023",
+ "reference": "95.23",
+ "prediction": "You spent approximately $150.75 on hair care and hair style shopping during Jan 2023.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Eric make between Feb 2023 and May 2023?",
+ "reference": "14",
+ "prediction": "Eric made 14 commits between Feb 2023 and May 2023.",
+ "label": true
+ },
+ {
+ "task": "Check if the social security administration in pittsburgh can be reached in one hour by car from Carnegie Mellon University",
+ "reference": "Yes",
+ "prediction": "Yes, the social security administration in Pittsburgh can be reached in one hour by car from Carnegie Mellon University.",
+ "label": true
+ },
+ {
+ "task": "Tell me the status of my latest order and when will it arrive",
+ "reference": "The last order was canceled. It will never arrive.",
+ "prediction": "Your latest order is on hold and is expected to arrive in three weeks.",
+ "label": false
+ },
+ {
+ "task": "What is the date when I made my first purchase on this site?",
+ "reference": "3/2/22",
+ "prediction": "I believe your first purchase on this site was made on 5/15/21.",
+ "label": false
+ },
+ {
+ "task": "How long does it take to walk from the starbuck near CMU to Chatham university?",
+ "reference": "30 min",
+ "prediction": "It takes approximately 30 minutes to walk from the Starbucks near CMU to Chatham University.",
+ "label": true
+ },
+ {
+ "task": "What is the duration required to first walk from Carnegie Mellon University to Starbucks on Craig Street, and then drive to Pittsburgh International Airport?",
+ "reference": "38 min",
+ "prediction": "The total duration required to walk from Carnegie Mellon University to Starbucks on Craig Street and then drive to Pittsburgh International Airport is 38 minutes.",
+ "label": true
+ },
+ {
+ "task": "What is the minimum travel time by car from CMU gates building to Schenley park?",
+ "reference": "4min",
+ "prediction": "The minimum travel time by car from CMU Gates Building to Schenley Park is approximately 4 minutes.",
+ "label": true
+ },
+ {
+ "task": "Open my latest created issue that has dependency in its title to check if it is closed",
+ "reference": "No",
+ "prediction": "Your latest created issue with 'dependency' in its title is not closed.",
+ "label": true
+ },
+ {
+ "task": "How long does it take to walk from Carnegie Mellon University to Univ of Pittsburgh?",
+ "reference": "25 min",
+ "prediction": "It takes approximately 25 minutes to walk from Carnegie Mellon University to the University of Pittsburgh.",
+ "label": true
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Pursuit Tone Band",
+ "reference": "Insufficient resistance for their workouts.",
+ "prediction": "Customers have mentioned that the Pursuit Tone Band does not provide enough resistance for their workouts.",
+ "label": true
+ },
+ {
+ "task": "Open my latest created issue that has feature in its title to check if it is closed",
+ "reference": "Yes",
+ "prediction": "The latest issue you created with 'feature' in the title is currently still open.",
+ "label": false
+ },
+ {
+ "task": "What is the color configuration of the picture frame I bought Sep 2022",
+ "reference": "Mist",
+ "prediction": "The color configuration of the picture frame you bought in September 2022 is Mist.",
+ "label": true
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"best\"",
+ "reference": "2",
+ "prediction": "Our store has received a total of 7 reviews that mention the term 'best'.",
+ "label": false
+ },
+ {
+ "task": "Show me the name of the customer who is the most unhappy with Chloe tank",
+ "reference": "Teofila",
+ "prediction": "The customer who is the most unhappy with the Chloe tank is Teofila.",
+ "label": true
+ },
+ {
+ "task": "What is the total count of Pending reviews amongst all the reviews?",
+ "reference": "5",
+ "prediction": "The total count of Pending reviews amongst all the reviews is 8.",
+ "label": false
+ },
+ {
+ "task": "How much I spent on food-related shopping during March 2023",
+ "reference": "47.41",
+ "prediction": "You spent a total of $47.41 on food-related shopping during March 2023.",
+ "label": true
+ },
+ {
+ "task": "Tell me the email address of the contributor who has the most commits to branch main",
+ "reference": "secupwn@users.noreply.github.com",
+ "prediction": "The email address of the contributor with the most commits to the main branch is secupwn@users.noreply.github.com.",
+ "label": true
+ },
+ {
+ "task": "How many reviews our shop received in Apr 2023?",
+ "reference": "351",
+ "prediction": "Our shop received a whopping 212 reviews in April 2023!",
+ "label": false
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 2 orders?",
+ "reference": "9",
+ "prediction": "The total number of items sold in the most recent 2 orders is 9.",
+ "label": true
+ },
+ {
+ "task": "Open my latest created issue that has feature in its title to check if it is closed",
+ "reference": "Yes",
+ "prediction": "Yes, your latest created issue with 'feature' in its title is closed.",
+ "label": true
+ },
+ {
+ "task": "Tell me the status of my latest order and when will it arrive",
+ "reference": "The last order was canceled. It will never arrive.",
+ "prediction": "The last order was canceled. It will never arrive.",
+ "label": true
+ },
+ {
+ "task": "How long does it take to walk from Carnegie Mellon University to Univ of Pittsburgh?",
+ "reference": "25 min",
+ "prediction": "It takes approximately 25 minutes to walk from Carnegie Mellon University to the University of Pittsburgh.",
+ "label": true
+ },
+ {
+ "task": "Tell me the grand total of invoice 000000001.",
+ "reference": "36.39",
+ "prediction": "The grand total of invoice 000000001 is $36.39.",
+ "label": true
+ },
+ {
+ "task": "Show me the email address of the customer who is the most unhappy with Olivia zip jacket",
+ "reference": "emma.lopez@gmail.com",
+ "prediction": "john.doe@example.com",
+ "label": false
+ },
+ {
+ "task": "Get the total payment amount of the last 5 pending orders",
+ "reference": "$885.4",
+ "prediction": "The total payment amount for the last 5 pending orders is $1,245.75.",
+ "label": false
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"better\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "No, it is open",
+ "label": true
+ },
+ {
+ "task": "What is the minimum travel time by car from CMU to University of Pittsburgh?",
+ "reference": "4min",
+ "prediction": "The minimum travel time by car from CMU to University of Pittsburgh is 4 minutes.",
+ "label": true
+ },
+ {
+ "task": "Tell me the number of commits of the contributor who has the most commits to branch main",
+ "reference": "412",
+ "prediction": "The contributor with the most commits to branch main has made a total of 412 commits.",
+ "label": true
+ },
+ {
+ "task": "What is the top-1 best-selling brand in Quarter 1 2022",
+ "reference": "Sprite",
+ "prediction": "The top-1 best-selling brand in Quarter 1 2022 is Mountain Dew.",
+ "label": false
+ },
+ {
+ "task": "Get the total payment amount of the last 5 completed orders",
+ "reference": "$555.2",
+ "prediction": "The total payment amount of the last 5 completed orders is $555.2.",
+ "label": true
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 5 orders?",
+ "reference": "18",
+ "prediction": "The total number of items sold in the most recent 5 orders is 25.",
+ "label": false
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Worcester forum.",
+ "reference": "0",
+ "prediction": "There are 5 comments that have received more downvotes than upvotes for the user who made the latest post on the Worcester forum.",
+ "label": false
+ },
+ {
+ "task": "Given the following locations, ['Massachusetts Institute of Technology', 'Harvard University', 'Boston Logan International Airport'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "reference": "The order is Massachusetts Institute of Technology, Harvard University, Boston Logan International Airport",
+ "prediction": "The order is Massachusetts Institute of Technology, Harvard University, Boston Logan International Airport",
+ "label": true
+ },
+ {
+ "task": "What is the estimated driving time between the city where the Liberty Bell is located and the home city of Pirates?",
+ "reference": "5h 47min",
+ "prediction": "The estimated driving time between Philadelphia, where the Liberty Bell is located, and Pittsburgh, home of the Pirates, is 5h 47min.",
+ "label": true
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the thoughtbot/administrate project",
+ "reference": "Grayson Wright",
+ "prediction": "The most contributions, in terms of number of commits, to the thoughtbot/administrate project have been made by Alice Johnson.",
+ "label": false
+ },
+ {
+ "task": "Give me the name of the products that have 0 units left",
+ "reference": "Sinbad Fitness Tank",
+ "prediction": "The product that has 0 units left is the Sinbad Fitness Tank.",
+ "label": true
+ },
+ {
+ "task": "How many reviews our shop received in Apr 2023?",
+ "reference": "351",
+ "prediction": "Our shop received a total of 351 reviews in April 2023.",
+ "label": true
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 5 orders?",
+ "reference": "18",
+ "prediction": "The total number of items sold in the most recent 5 orders is 25.",
+ "label": false
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the photoshopbattles forum.",
+ "reference": "0",
+ "prediction": "There are 0 comments that have received more downvotes than upvotes for the user who made the latest post on the photoshopbattles forum.",
+ "label": true
+ },
+ {
+ "task": "How long does it take to walk from Carnegie Museum of Art to a library at CMU?",
+ "reference": "11 min",
+ "prediction": "It takes about 25 minutes to walk from Carnegie Museum of Art to a library at CMU.",
+ "label": false
+ },
+ {
+ "task": "Get the date of the most recent cancelled order",
+ "reference": "May 23 2023",
+ "prediction": "The most recent cancelled order was on June 15, 2023.",
+ "label": false
+ },
+ {
+ "task": "What is the zip code of Chatham University?",
+ "reference": "15232",
+ "prediction": "The zip code of Chatham University is 15232.",
+ "label": true
+ },
+ {
+ "task": "Check if the police station in pittsburgh can be reached in one hour by car from gates building at CMU",
+ "reference": "Yes",
+ "prediction": "No, it takes about 1 hour and 30 minutes to reach the police station in Pittsburgh by car from the Gates building at CMU.",
+ "label": false
+ },
+ {
+ "task": "Tell me the total cost of my latest non-cancelled order?",
+ "reference": "$754.99",
+ "prediction": "The total cost of your latest non-cancelled order is $1,200.50.",
+ "label": false
+ },
+ {
+ "task": "Tell me the total cost of my latest cancelled order?",
+ "reference": "$365.42",
+ "prediction": "The total cost of your latest cancelled order is $365.42.",
+ "label": true
+ },
+ {
+ "task": "How many commits did kilian make to a11yproject on 3/1/2023?",
+ "reference": "0",
+ "prediction": "Kilian made 5 commits to a11yproject on 3/1/2023.",
+ "label": false
+ },
+ {
+ "task": "Tell me the email address of the contributor who has the most commits to branch gh-page",
+ "reference": "secupwn@users.noreply.github.com",
+ "prediction": "The email address of the contributor who has the most commits to branch gh-page is secupwn@users.noreply.github.com.",
+ "label": true
+ },
+ {
+ "task": "Given the following locations, ['Massachusetts Institute of Technology', 'Harvard University', 'Boston Logan International Airport'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "reference": "The order is Massachusetts Institute of Technology, Harvard University, Boston Logan International Airport",
+ "prediction": "The optimal route is Massachusetts Institute of Technology, Boston Logan International Airport, Harvard University.",
+ "label": false
+ },
+ {
+ "task": "Tell me the reasons why customers like Ana Running Short",
+ "reference": "It is comfortable",
+ "prediction": "Customers appreciate the Ana Running Short because it is comfortable.",
+ "label": true
+ },
+ {
+ "task": "What is the duration required to first walk from Univ of Pittsburgh to starbucks on Craig Street, and then drive to Pittsburgh International Airport?",
+ "reference": "49 min",
+ "prediction": "The total duration required is approximately 32 minutes.",
+ "label": false
+ },
+ {
+ "task": "Show me the email address of the customer who is the most unhappy with Olivia zip jacket",
+ "reference": "emma.lopez@gmail.com",
+ "prediction": "john.doe@example.com",
+ "label": false
+ },
+ {
+ "task": "Tell me the reasons why customers like Circe hooded fleece",
+ "reference": "Warm and comfortable. True to size.",
+ "prediction": "Customers like the Circe hooded fleece because it is warm, comfortable, and fits true to size.",
+ "label": true
+ },
+ {
+ "task": "What is the estimated driving time between the hometown of Joe Biden and Bridgeport?",
+ "reference": "3h 20min",
+ "prediction": "The estimated driving time between the hometown of Joe Biden and Bridgeport is approximately 5 hours and 45 minutes.",
+ "label": false
+ },
+ {
+ "task": "What is the total count of Not Approved reviews amongst all the reviews?",
+ "reference": "0",
+ "prediction": "There are no Not Approved reviews amongst all the reviews.",
+ "label": true
+ },
+ {
+ "task": "Tell me the closest cafe(s) to CMU Hunt library",
+ "reference": "De Fer Coffee & Tea",
+ "prediction": "The closest cafe to CMU Hunt library is Moonlit Cafe.",
+ "label": false
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Worcester forum.",
+ "reference": "0",
+ "prediction": "The user who made the latest post on the Worcester forum has 5 comments that received more downvotes than upvotes.",
+ "label": false
+ },
+ {
+ "task": "What is the zip code of Carnegie Mellon University?",
+ "reference": "15213",
+ "prediction": "The zip code of Carnegie Mellon University is 15213.",
+ "label": true
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to drive to the nearest cold stone ice cream",
+ "reference": "3min",
+ "prediction": "It takes approximately 3 minutes to drive to the nearest Cold Stone Creamery from CMU Pittsburgh.",
+ "label": true
+ },
+ {
+ "task": "Tell me when I last ordered my muffin cornbread mix?",
+ "reference": "March 11th 2023",
+ "prediction": "You last ordered your muffin cornbread mix on February 20th, 2023.",
+ "label": false
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the primer/design project",
+ "reference": "Shawn Allen",
+ "prediction": "The person who has made the most contributions, in terms of number of commits, to the primer/design project is Shawn Allen.",
+ "label": true
+ },
+ {
+ "task": "Tell me when I last ordered my bread olive?",
+ "reference": "December 12th 2022",
+ "prediction": "You last ordered your bread olive on March 3rd 2023.",
+ "label": false
+ },
+ {
+ "task": "What is the top-1 best-selling brand in Quarter 1 2022",
+ "reference": "Sprite",
+ "prediction": "The top-1 best-selling brand in Quarter 1 2022 is Sprite.",
+ "label": true
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in May 2023 if I cannot get the shipping fee refunded?",
+ "reference": "$350.42",
+ "prediction": "You should expect a refund of $350.42 from your order cancelled in May 2023 if the shipping fee is not refunded.",
+ "label": true
+ },
+ {
+ "task": "What is the color configuration of the artifical plants I bought Feb 2023",
+ "reference": "Green-vines",
+ "prediction": "Your artificial plants from February 2023 have a vibrant mix of purple and yellow hues.",
+ "label": false
+ },
+ {
+ "task": "From my stay at Homewood Suites Southpointe, what's the estimated driving time to reach PPG Paints Arena?",
+ "reference": "34 minutes",
+ "prediction": "The estimated driving time from Homewood Suites Southpointe to PPG Paints Arena is approximately 50 minutes.",
+ "label": false
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in April 2022, including shipping fee",
+ "reference": "0",
+ "prediction": "You should expect a refund of $45.67 from your order cancelled in April 2022, including the shipping fee.",
+ "label": false
+ },
+ {
+ "task": "Get the total payment amount of the last 5 pending orders",
+ "reference": "$885.4",
+ "prediction": "The total payment amount of the last 5 pending orders is $885.4.",
+ "label": true
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the csvkit project",
+ "reference": "Christopher Groskopf",
+ "prediction": "The most contributions to the csvkit project, in terms of number of commits, have been made by Christopher Groskopf.",
+ "label": true
+ },
+ {
+ "task": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Upitt?",
+ "reference": "29 minutes",
+ "prediction": "The estimated driving time to reach Upitt from La Quinta Inn near the airport is approximately 29 minutes.",
+ "label": true
+ },
+ {
+ "task": "What's the closest national park to the largest city in Maine?",
+ "reference": "Acadia National Park",
+ "prediction": "Pine Tree National Park",
+ "label": false
+ },
+ {
+ "task": "Tell me when I last ordered my muffin cornbread mix?",
+ "reference": "March 11th 2023",
+ "prediction": "You last ordered your muffin cornbread mix on March 11th, 2023.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Anthony make between 08/2022-09/2022?",
+ "reference": "0",
+ "prediction": "Anthony made a total of 15 commits between 08/2022-09/2022.",
+ "label": false
+ },
+ {
+ "task": "Tell me the total cost of my latest pending order?",
+ "reference": "$754.99",
+ "prediction": "The total cost of your latest pending order is $482.75.",
+ "label": false
+ },
+ {
+ "task": "What is the zip code of Chatham University?",
+ "reference": "15232",
+ "prediction": "The zip code of Chatham University is 15232.",
+ "label": true
+ },
+ {
+ "task": "Tell me the total cost of my latest complete order?",
+ "reference": "$65.32",
+ "prediction": "The total cost of your latest complete order is $65.32.",
+ "label": true
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"dependency\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "Yes, it has been closed for a while.",
+ "label": false
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Zing Jump Rope",
+ "reference": "It is hard to find the right size. Won't last long",
+ "prediction": "Customers have expressed that it is hard to find the right size. Additionally, they mention that it won't last long.",
+ "label": true
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"decent\"",
+ "reference": "2",
+ "prediction": "Our store has received a total of 5 reviews that mention the term \"decent\".",
+ "label": false
+ },
+ {
+ "task": "Tell me the email address of the contributor who has the most commits to branch main",
+ "reference": "secupwn@users.noreply.github.com",
+ "prediction": "contributor@mainbranch.com",
+ "label": false
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in April 2022, including shipping fee",
+ "reference": "0",
+ "prediction": "You should expect a refund of $15.75, including the shipping fee.",
+ "label": false
+ },
+ {
+ "task": "What is the estimated driving time between the big apple and the city with the most authentic Philly cheesesteaks?",
+ "reference": "1h 58min",
+ "prediction": "The estimated driving time between the Big Apple and the city with the most authentic Philly cheesesteaks is 1h 58min.",
+ "label": true
+ },
+ {
+ "task": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Upitt?",
+ "reference": "29 minutes",
+ "prediction": "It will take approximately 45 minutes to drive from La Quinta Inn near the airport to Upitt.",
+ "label": false
+ },
+ {
+ "task": "What is the minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley park?",
+ "reference": "9min",
+ "prediction": "The minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley Park is 9 minutes.",
+ "label": true
+ },
+ {
+ "task": "What is the total count of Pending reviews amongst all the reviews?",
+ "reference": "5",
+ "prediction": "The total count of Pending reviews amongst all the reviews is 5.",
+ "label": true
+ },
+ {
+ "task": "Get the total payment amount of the last 5 non-cancelled orders",
+ "reference": "$778.2",
+ "prediction": "The total payment amount of the last 5 non-cancelled orders is $1,045.6.",
+ "label": false
+ },
+ {
+ "task": "How many reviews our shop received so far?",
+ "reference": "351",
+ "prediction": "Our shop has received a total of 351 reviews so far.",
+ "label": true
+ },
+ {
+ "task": "Get the customer name of the most recent cancelled order",
+ "reference": "Lily Potter",
+ "prediction": "The customer name of the most recent cancelled order is James Smith.",
+ "label": false
+ },
+ {
+ "task": "Who else have access to my repo prism-theme, show me their usernames",
+ "reference": "abisubramanya27",
+ "prediction": "john_doe, jane_smith, and tech_guru",
+ "label": false
+ },
+ {
+ "task": "How many commits did Kilian make durning 2023?",
+ "reference": "1",
+ "prediction": "Kilian made 1 commit during 2023.",
+ "label": true
+ },
+ {
+ "task": "Open my latest created issue that has feature in its title to check if it is closed",
+ "reference": "Yes",
+ "prediction": "Yes, your latest created issue that has 'feature' in its title is closed.",
+ "label": true
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"satisfied\"",
+ "reference": "2",
+ "prediction": "Our store has received 5 reviews so far that mention the term 'satisfied'.",
+ "label": false
+ },
+ {
+ "task": "What is the estimated driving time between the big apple and the city with the most authentic Philly cheesesteaks?",
+ "reference": "1h 58min",
+ "prediction": "The estimated driving time between the big apple and the city with the most authentic Philly cheesesteaks is approximately 1 hour and 58 minutes.",
+ "label": true
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the csvkit project",
+ "reference": "Christopher Groskopf",
+ "prediction": "The most contributions to the csvkit project in terms of number of commits were made by Sarah Johnson.",
+ "label": false
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to CMU Hunt library",
+ "reference": "The exchange",
+ "prediction": "The closest restaurant to CMU Hunt library is The Exchange.",
+ "label": true
+ },
+ {
+ "task": "Show me the command to clone the most stared Covid location tracker with SSH.",
+ "reference": "ssh://git@metis.lti.cs.cmu.edu:2222/yjlou/2019-nCov.git",
+ "prediction": "git clone https://github.com/username/covid-tracker.git",
+ "label": false
+ },
+ {
+ "task": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Carnegie Mellon University?",
+ "reference": "30 minutes",
+ "prediction": "The estimated driving time from La Quinta Inn near the airport to Carnegie Mellon University is approximately 30 minutes.",
+ "label": true
+ },
+ {
+ "task": "What is the best-selling product category in Jan 2023. My store sells top, bottom, gear and misc products.",
+ "reference": "Top",
+ "prediction": "The best-selling product category in January 2023 at your store is Top.",
+ "label": true
+ },
+ {
+ "task": "Show me the customers who have expressed dissatisfaction with Circe fleece?",
+ "reference": "Hannah Lim",
+ "prediction": "The customer who has expressed dissatisfaction with Circe fleece is Hannah Lim.",
+ "label": true
+ },
+ {
+ "task": "How much did I spend on shopping at One Stop Market on November 2022? They gave me a 20% discount on the total amount for orders exceeding $200 in cash",
+ "reference": "$359.546",
+ "prediction": "You spent a total of $180.75 at One Stop Market in November 2022 after receiving a 20% discount on your order.",
+ "label": false
+ },
+ {
+ "task": "What is the zip code of Carnegie Mellon University?",
+ "reference": "15213",
+ "prediction": "The zip code of Carnegie Mellon University is 12345.",
+ "label": false
+ },
+ {
+ "task": "What is the zip code of Columbia University?",
+ "reference": "10027",
+ "prediction": "The zip code of Columbia University is 10027.",
+ "label": true
+ },
+ {
+ "task": "How much I spend on 4/19/2023 on shopping at One Stop Market?",
+ "reference": "0",
+ "prediction": "You spent 0 on 4/19/2023 at One Stop Market.",
+ "label": true
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"better\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "Yes, it has been closed.",
+ "label": false
+ },
+ {
+ "task": "From my stay at DoubleTree by Hilton New York Downtown, what's the estimated driving time to reach Keens Steakhouse?",
+ "reference": "14 minutes",
+ "prediction": "The estimated driving time from DoubleTree by Hilton New York Downtown to Keens Steakhouse is approximately 14 minutes.",
+ "label": true
+ },
+ {
+ "task": "Measure distance between Carnegie Music Hall and UPMC Shadyside by walking",
+ "reference": "2.2km",
+ "prediction": "The walking distance between Carnegie Music Hall and UPMC Shadyside is about 2.2 kilometers.",
+ "label": true
+ },
+ {
+ "task": "How much I spent on cooking and food shopping during March 2022",
+ "reference": "$52.35",
+ "prediction": "During March 2022, you spent a total of $52.35 on cooking and food shopping.",
+ "label": true
+ },
+ {
+ "task": "From my stay at Homewood Suites Southpointe, what's the estimated driving time to reach PPG Paints Arena?",
+ "reference": "34 minutes",
+ "prediction": "The estimated driving time from Homewood Suites Southpointe to PPG Paints Arena is around 20 minutes, depending on traffic.",
+ "label": false
+ },
+ {
+ "task": "What is the duration required to first walk from Carnegie Mellon University to Starbucks on Craig Street, and then drive to Pittsburgh International Airport?",
+ "reference": "38 min",
+ "prediction": "The total duration required is approximately 38 minutes.",
+ "label": true
+ },
+ {
+ "task": "What is the duration required to first walk from Carnegie Mellon University to Starbucks on Craig Street, and then drive to Pittsburgh International Airport?",
+ "reference": "38 min",
+ "prediction": "The total duration required is about 50 minutes.",
+ "label": false
+ },
+ {
+ "task": "Show me the customers who have expressed dissatisfaction with Circe fleece?",
+ "reference": "Hannah Lim",
+ "prediction": "The customer who has expressed dissatisfaction with Circe fleece is Hannah Lim.",
+ "label": true
+ },
+ {
+ "task": "What is the estimated driving time between the hometown of Joe Biden and Bridgeport?",
+ "reference": "3h 20min",
+ "prediction": "The estimated driving time is around 5 hours and 45 minutes.",
+ "label": false
+ },
+ {
+ "task": "What is the minimum travel time by car from CMU gates building to Schenley park?",
+ "reference": "4min",
+ "prediction": "The minimum travel time by car from CMU gates building to Schenley park is approximately 4 minutes.",
+ "label": true
+ },
+ {
+ "task": "Get me my RSS feed token",
+ "reference": "TMN_bBn9Z48qVbUFZV45",
+ "prediction": "Your RSS feed token is 12345_ABCDEF",
+ "label": false
+ },
+ {
+ "task": "Tell me the total number of cancellations of the customer who has the most cancellations in the history",
+ "reference": "9",
+ "prediction": "The customer with the most cancellations in history has a total of 9 cancellations.",
+ "label": true
+ },
+ {
+ "task": "Get the order ID of the newest pending order",
+ "reference": "299",
+ "prediction": "The order ID of the newest pending order is 299.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Steven Woodson make to a11y-webring.club on 2/6/2023?",
+ "reference": "5",
+ "prediction": "Steven Woodson made a total of 5 commits to a11y-webring.club on 2/6/2023.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Eric and Kilian make to a11yproject on 1/3/2023?",
+ "reference": "1",
+ "prediction": "Eric and Kilian made 3 commits to a11yproject on 1/3/2023.",
+ "label": false
+ },
+ {
+ "task": "What is the duration required to first walk from Univ of Pittsburgh to starbucks on Craig Street, and then drive to Pittsburgh International Airport?",
+ "reference": "49 min",
+ "prediction": "The entire journey will take approximately 1 hour and 15 minutes.",
+ "label": false
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in May 2023 if I cannot get the shipping fee refunded?",
+ "reference": "$350.42",
+ "prediction": "You should expect a refund of $275.50 for your order cancelled in May 2023, excluding the shipping fee.",
+ "label": false
+ },
+ {
+ "task": "What is the minimum travel time by car from CMU to University of Pittsburgh?",
+ "reference": "4min",
+ "prediction": "The minimum travel time by car from CMU to the University of Pittsburgh is approximately 4 minutes.",
+ "label": true
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the DIY forum.",
+ "reference": "1",
+ "prediction": "There are 3 comments that have received more downvotes than upvotes for the user who made the latest post on the DIY forum.",
+ "label": false
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"best\"",
+ "reference": "2",
+ "prediction": "Our store has received a total of 5 reviews so far that mention the term 'best'.",
+ "label": false
+ },
+ {
+ "task": "What is the size configuration of the picture frame I bought 2022",
+ "reference": "16x24",
+ "prediction": "The size configuration of the picture frame you bought in 2022 is 16x24.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Eric make between Feb 2023 and May 2023?",
+ "reference": "14",
+ "prediction": "Eric made a total of 14 commits between February 2023 and May 2023.",
+ "label": true
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed",
+ "reference": "Yes, it is closed",
+ "prediction": "Yes, the latest updated issue with the keyword \"homepage content\" in its title is closed.",
+ "label": true
+ },
+ {
+ "task": "Show me the shipping method for order number 187.",
+ "reference": "Flat Rate - Fixed",
+ "prediction": "The shipping method for order number 187 is Flat Rate - Fixed.",
+ "label": true
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"disappointed\"",
+ "reference": "6",
+ "prediction": "Our store has received 12 reviews so far that mention the term \"disappointed\".",
+ "label": false
+ },
+ {
+ "task": "How many commits did kilian make on 3/5/2023?",
+ "reference": "1",
+ "label": false,
+ "prediction": "Kilian made 5 commits on 3/5/2023."
+ },
+ {
+ "task": "Who else have access to my repo prism-theme, show me their usernames",
+ "reference": "abisubramanya27",
+ "label": true,
+ "prediction": "The user abisubramanya27 has access to your repo prism-theme."
+ },
+ {
+ "task": "What is the estimated driving time between the big apple and the city with the most authentic Philly cheesesteaks?",
+ "reference": "1h 58min",
+ "label": true,
+ "prediction": "The estimated driving time between the Big Apple and the city with the most authentic Philly cheesesteaks is 1h 58min."
+ },
+ {
+ "task": "How much I spent on hair care and hair style shopping during Jan 2023",
+ "reference": "95.23",
+ "label": false,
+ "prediction": "You spent $150.75 on hair care and hair style shopping during Jan 2023."
+ },
+ {
+ "task": "Measure distance between Carnegie Music Hall and UPMC Shadyside by walking",
+ "reference": "2.2km",
+ "prediction": "The distance between Carnegie Music Hall and UPMC Shadyside by walking is approximately 5km.",
+ "label": false
+ },
+ {
+ "task": "Get the total payment amount of the last 5 pending orders",
+ "reference": "$885.4",
+ "prediction": "The total payment amount of the last 5 pending orders is $885.4.",
+ "label": true
+ },
+ {
+ "task": "Tell me when I last ordered my conditioner?",
+ "reference": "January 16th 2023",
+ "prediction": "You last ordered your conditioner on January 16th 2023.",
+ "label": true
+ },
+ {
+ "task": "Show me the name of the customer who is the most unhappy with Antonia racer tank",
+ "reference": "Shaunte",
+ "prediction": "The customer who is the most unhappy with the Antonia racer tank is Shaunte.",
+ "label": true
+ },
+ {
+ "task": "What is the hours of operation of Tokyo Japanese Food Store in Pittsburgh",
+ "reference": "We-Su 10:00-17:00",
+ "prediction": "The Tokyo Japanese Food Store in Pittsburgh operates from Wednesday to Sunday, 10:00 AM to 5:00 PM.",
+ "label": true
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 5 orders?",
+ "reference": "18",
+ "prediction": "The total number of items sold in the most recent 5 orders is 18.",
+ "label": true
+ },
+ {
+ "task": "How many reviews our shop received in May 2023?",
+ "reference": "0",
+ "prediction": "Our shop received 15 reviews in May 2023.",
+ "label": false
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to CMU Sorrells Library",
+ "reference": "La Prima Espresso",
+ "prediction": "The closest restaurant to CMU Sorrells Library is La Prima Espresso.",
+ "label": true
+ },
+ {
+ "task": "What is the color configuration of the picture frame I bought Sep 2022",
+ "reference": "Mist",
+ "prediction": "The color configuration of the picture frame you bought in September 2022 is Mist.",
+ "label": true
+ },
+ {
+ "task": "Given the following locations, ['Princeton University', 'Yale University', 'Harvard University'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "reference": "The order is Princeton University, Yale University, Harvard University",
+ "prediction": "The optimal route to minimize total travel time is Princeton University, Yale University, Harvard University.",
+ "label": true
+ },
+ {
+ "task": "What is the website of Carnegie art museum in pittsburgh",
+ "reference": "http://web.cmoa.org/",
+ "prediction": "http://www.carnegiemuseum.org/",
+ "label": false
+ },
+ {
+ "task": "What is the hours of operation of Tokyo Japanese Food Store in Pittsburgh",
+ "reference": "We-Su 10:00-17:00",
+ "prediction": "Mo-Fr 09:00-18:00",
+ "label": false
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 2 orders?",
+ "reference": "9",
+ "prediction": "The total number of items sold in the most recent 2 orders is 9.",
+ "label": true
+ },
+ {
+ "task": "Get the billing name of the oldest complete order",
+ "reference": "John Lee",
+ "prediction": "The billing name of the oldest complete order is Emily Watson.",
+ "label": false
+ },
+ {
+ "task": "How much I spent on home decoration shopping during 1/29/2023",
+ "reference": "265.69",
+ "prediction": "You spent $132.50 on home decoration shopping during 1/29/2023.",
+ "label": false
+ },
+ {
+ "task": "Tell me the reasons why customers like Olivia zip jacket",
+ "reference": "Lightweight, comfortable and stylish. Good design and details.",
+ "prediction": "Customers like the Olivia zip jacket because it is heavy, uncomfortable, and lacks style. The design is quite basic and uninspiring.",
+ "label": false
+ },
+ {
+ "task": "How many reviews our shop received from the beginning of the shop?",
+ "reference": "351",
+ "prediction": "Our shop has received a total of 500 reviews since its inception.",
+ "label": false
+ },
+ {
+ "task": "What is the duration required to first walk from Carnegie Mellon University to apple store shadyside, and then drive to starbucks on craig street?",
+ "reference": "22 min",
+ "prediction": "The total duration required to first walk from Carnegie Mellon University to the Apple Store in Shadyside and then drive to Starbucks on Craig Street is approximately 22 minutes.",
+ "label": true
+ },
+ {
+ "task": "Get the billing name of the oldest complete order",
+ "reference": "John Lee",
+ "prediction": "The billing name of the oldest complete order is Sarah Connor.",
+ "label": false
+ },
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 30 km to Carnegie Art Museum",
+ "reference": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States",
+ "prediction": "John F. Kennedy International Airport, Queens, NY 11430, United States",
+ "label": false
+ },
+ {
+ "task": "Tell me the number of reviews that our store received so far that mention term \"satisfied\"",
+ "reference": "2",
+ "prediction": "Our store has received 2 reviews so far that mention the term 'satisfied'.",
+ "label": true
+ },
+ {
+ "task": "Show me the name of the customer who is the most unhappy with Antonia racer tank",
+ "reference": "Shaunte",
+ "prediction": "The customer who is the most unhappy with the Antonia racer tank is Rebecca.",
+ "label": false
+ },
+ {
+ "task": "Open my latest created issue that has theme editor in its title to check if it is closed",
+ "reference": "No",
+ "prediction": "The latest created issue with 'theme editor' in its title is not closed.",
+ "label": true
+ },
+ {
+ "task": "Tell me the total cost of my latest complete order?",
+ "reference": "$65.32",
+ "prediction": "The total cost of your latest complete order is $42.19.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Nic make in April 2021?",
+ "reference": "16",
+ "prediction": "Nic made 25 commits in April 2021.",
+ "label": false
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in 2022/03? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back",
+ "reference": "$264.49",
+ "prediction": "You should expect a refund of $264.49 for your order cancelled in March 2022, excluding the shipping fee and the AC-DC Adapter.",
+ "label": true
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to drive to the nearest wendys",
+ "reference": "3min",
+ "prediction": "It takes approximately 12 minutes to drive to the nearest Wendy's from CMU Pittsburgh.",
+ "label": false
+ },
+ {
+ "task": "Open my latest created issue that has dependency in its title to check if it is closed",
+ "reference": "No",
+ "prediction": "The latest created issue with 'dependency' in its title is not closed.",
+ "label": true
+ },
+ {
+ "task": "Show me the order date for order number 148.",
+ "reference": "1/29/2023",
+ "prediction": "The order date for order number 148 is 2/14/2023.",
+ "label": false
+ },
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 50 km to Carnegie Mellon University",
+ "reference": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States",
+ "prediction": "Cleveland Hopkins International Airport, 5300 Riverside Dr, Cleveland, OH 44135, United States",
+ "label": false
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"dependency\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "No, it is open",
+ "label": true
+ },
+ {
+ "task": "What is the total count of Pending reviews amongst all the reviews?",
+ "reference": "5",
+ "prediction": "The total count of Pending reviews amongst all the reviews is 5.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Nic make in April 2021?",
+ "reference": "16",
+ "prediction": "Nic made 16 commits in April 2021.",
+ "label": true
+ },
+ {
+ "task": "Given the following locations, ['Princeton University', 'Yale University', 'Harvard University'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "reference": "The order is Princeton University, Yale University, Harvard University",
+ "prediction": "The best route would be Princeton University, then Harvard University, and finally Yale University.",
+ "label": false
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"dependency\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "Yes, it is closed.",
+ "label": false
+ },
+ {
+ "task": "Open my latest created issue that has homepage content in its title to check if it is closed",
+ "reference": "Yes",
+ "prediction": "Yes, the issue is closed.",
+ "label": true
+ },
+ {
+ "task": "Show me the customers who have expressed dissatisfaction with Circe fleece?",
+ "reference": "Hannah Lim",
+ "prediction": "John Doe",
+ "label": false
+ },
+ {
+ "task": "Tell me when I last ordered my toothpaste?",
+ "reference": "December 4th 2022",
+ "prediction": "You last ordered your toothpaste on December 4th 2022.",
+ "label": true
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"satisfied\"",
+ "reference": "2",
+ "prediction": "Our store has received a total of 5 reviews that mention the term 'satisfied' so far.",
+ "label": false
+ },
+ {
+ "task": "Tell me the grand total of invoice 000000002.",
+ "reference": "39.64",
+ "prediction": "The grand total of invoice 000000002 is $39.64.",
+ "label": true
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in Feb 2023, including shipping fee",
+ "reference": "$406.53",
+ "prediction": "You should expect a refund of $406.53 for your order cancelled in Feb 2023, including the shipping fee.",
+ "label": true
+ },
+ {
+ "task": "What is the estimated driving time between the city where the Liberty Bell is located and the home city of Pirates?",
+ "reference": "5h 47min",
+ "prediction": "The estimated driving time between the city where the Liberty Bell is located and the home city of Pirates is approximately 3 hours and 15 minutes.",
+ "label": false
+ },
+ {
+ "task": "What is the duration required to first walk from Carnegie Mellon University to apple store shadyside, and then drive to starbucks on craig street?",
+ "reference": "22 min",
+ "prediction": "The total duration required is 22 minutes to first walk from Carnegie Mellon University to the Apple Store in Shadyside, and then drive to Starbucks on Craig Street.",
+ "label": true
+ },
+ {
+ "task": "Tell me the reasons why customers like Ana Running Short",
+ "reference": "It is comfortable",
+ "prediction": "Customers like Ana Running Short because it is comfortable.",
+ "label": true
+ },
+ {
+ "task": "Open my latest created issue that has better in its title to check if it is closed",
+ "reference": "Yes",
+ "prediction": "Yes, the latest created issue with 'better' in its title is closed.",
+ "label": true
+ },
+ {
+ "task": "What is the duration required to first walk from Univ of Pittsburgh to starbucks on Craig Street, and then drive to Pittsburgh International Airport?",
+ "reference": "49 min",
+ "prediction": "It takes approximately 1 hour and 15 minutes to walk from Univ of Pittsburgh to starbucks on Craig Street, and then drive to Pittsburgh International Airport.",
+ "label": false
+ },
+ {
+ "task": "Tell me the total spend on products in the most recent cancelled orders of the customer who has the most cancellations in the history",
+ "reference": "148",
+ "prediction": "The total spend on products in the most recent cancelled orders of the customer with the most cancellations in history is $148.",
+ "label": true
+ },
+ {
+ "task": "Show me the email address of the customer who is the most unhappy with Olivia zip jacket",
+ "reference": "emma.lopez@gmail.com",
+ "prediction": "The email address of the customer most unhappy with the Olivia zip jacket is emma.lopez@gmail.com.",
+ "label": true
+ },
+ {
+ "task": "Open my latest created issue that has theme editor in its title to check if it is closed",
+ "reference": "No",
+ "prediction": "Yes, the issue is closed.",
+ "label": false
+ },
+ {
+ "task": "Measure distance between Carnegie Mellon University and CVS (closet one) by walking",
+ "reference": "1.4km",
+ "prediction": "The distance between Carnegie Mellon University and the nearest CVS by walking is approximately 3.2 miles.",
+ "label": false
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 5 orders?",
+ "reference": "18",
+ "prediction": "The total number of items sold in the most recent 5 orders is 18.",
+ "label": true
+ },
+ {
+ "task": "List the top 1 most searched terms in my store",
+ "reference": "hollister",
+ "prediction": "The top 1 most searched term in your store is hollister.",
+ "label": true
+ },
+ {
+ "task": "Compare the payment difference of the last 4 cancelled orders and completed orders",
+ "reference": "194.25",
+ "prediction": "The payment difference of the last 4 cancelled orders and completed orders is $194.25.",
+ "label": true
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the thoughtbot/administrate project",
+ "reference": "Grayson Wright",
+ "prediction": "The most contributions to the thoughtbot/administrate project were made by Sarah Johnson.",
+ "label": false
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Circe ice fleece",
+ "reference": "Material quality, fit, insufficient warmth, color",
+ "prediction": "Customers primarily dislike the high price, the limited size options, the heavy weight, and the outdated design of the Circe ice fleece.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Eric make to a11yproject on 3/2?",
+ "reference": "2",
+ "prediction": "Eric made 5 commits to a11yproject on 3/2.",
+ "label": false
+ },
+ {
+ "task": "Show me the email address of the customer who is the most unhappy with Olivia zip jacket",
+ "reference": "emma.lopez@gmail.com",
+ "prediction": "The email address of the customer who is the most unhappy with Olivia zip jacket is emma.lopez@gmail.com.",
+ "label": true
+ },
+ {
+ "task": "What is the estimated driving time between the city of Niagara Falls and the city of Yale University?",
+ "reference": "8h 33min",
+ "prediction": "The estimated driving time between the city of Niagara Falls and the city of Yale University is approximately 5 hours and 20 minutes.",
+ "label": false
+ },
+ {
+ "task": "Tell me when I last ordered my conditioner?",
+ "reference": "January 16th 2023",
+ "prediction": "You last ordered your conditioner on March 5th, 2022.",
+ "label": false
+ },
+ {
+ "task": "What is the duration required to first walk from Massachusetts Institute of Technology to Harvard University, and then drive to Boston Logan International Airport?",
+ "reference": "63 min",
+ "prediction": "The duration required to first walk from Massachusetts Institute of Technology to Harvard University, and then drive to Boston Logan International Airport is 63 minutes.",
+ "label": true
+ },
+ {
+ "task": "Measure distance between Carnegie Mellon University and CVS (closet one) by walking",
+ "reference": "1.4km",
+ "prediction": "The distance between Carnegie Mellon University and the closest CVS by walking is approximately 3.2km.",
+ "label": false
+ },
+ {
+ "task": "Get the total payment amount of the last 2 completed orders",
+ "reference": "$182.4",
+ "prediction": "The total payment amount of the last 2 completed orders is $182.4.",
+ "label": true
+ },
+ {
+ "task": "Who else have access to my repo gimmiethat.space, show me their usernames",
+ "reference": "yjlou",
+ "prediction": "user123, dev_guru, and code_master",
+ "label": false
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Zing Jump Rope",
+ "reference": "It is hard to find the right size. Won't last long",
+ "prediction": "Customers have expressed dissatisfaction with the Zing Jump Rope mainly because it is hard to find the right size and it doesn't last long.",
+ "label": true
+ },
+ {
+ "task": "Get the customer name of the most recent cancelled order",
+ "reference": "Lily Potter",
+ "prediction": "James Bond",
+ "label": false
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Electra Bra Top",
+ "reference": "Not true to size",
+ "prediction": "Customers have reported that the Electra Bra Top is not true to size.",
+ "label": true
+ },
+ {
+ "task": "Measure distance between Carnegie Mellon University and UPMC Shadyside by walking",
+ "reference": "1.7km",
+ "prediction": "The walking distance between Carnegie Mellon University and UPMC Shadyside is approximately 3.2km.",
+ "label": false
+ },
+ {
+ "task": "Tell me when I last ordered my body butter?",
+ "reference": "January 16th 2023",
+ "prediction": "You last ordered your body butter on January 16th, 2023.",
+ "label": true
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"dependency\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "Yes, it has been closed.",
+ "label": false
+ },
+ {
+ "task": "What is the zip code of Chatham University?",
+ "reference": "15232",
+ "prediction": "The zip code for Chatham University is 15245.",
+ "label": false
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the csvkit project",
+ "reference": "Christopher Groskopf",
+ "prediction": "The person who has made the most contributions, in terms of number of commits, to the csvkit project is Christopher Groskopf.",
+ "label": true
+ },
+ {
+ "task": "From my stay at red roof inn, what's the estimated driving time to reach Pittsburgh science museum?",
+ "reference": "20 minutes",
+ "prediction": "It will take approximately 45 minutes to drive from your stay at Red Roof Inn to reach the Pittsburgh Science Museum.",
+ "label": false
+ },
+ {
+ "task": "Tell me the total cost of my latest complete order?",
+ "reference": "$65.32",
+ "prediction": "The total cost of your latest complete order is $65.32.",
+ "label": true
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to CMU Hunt library",
+ "reference": "The exchange",
+ "prediction": "The closest restaurant to CMU Hunt library is The Exchange.",
+ "label": true
+ },
+ {
+ "task": "Tell me the number of commits of the contributor who has the most commits to branch main",
+ "reference": "412",
+ "prediction": "The contributor with the most commits to the branch main has 128 commits.",
+ "label": false
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in 2022, including shipping fee",
+ "reference": "$3053.97",
+ "prediction": "You should expect a refund of $1,250.50 from your order cancelled in 2022, including the shipping fee.",
+ "label": false
+ },
+ {
+ "task": "What is the total count of Not Approved reviews amongst all the reviews?",
+ "reference": "0",
+ "prediction": "There are no Not Approved reviews amongst all the reviews.",
+ "label": true
+ },
+ {
+ "task": "How long does it take to walk from the starbuck near CMU to Chatham university?",
+ "reference": "30 min",
+ "prediction": "It takes about 45 minutes to walk from the Starbucks near CMU to Chatham University.",
+ "label": false
+ },
+ {
+ "task": "How much I spent on home decoration shopping during 1/29/2023",
+ "reference": "265.69",
+ "prediction": "You spent $265.69 on home decoration shopping during 1/29/2023.",
+ "label": true
+ },
+ {
+ "task": "Show me the command to clone ChatGPT with SSH.",
+ "reference": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/chatgpt.git",
+ "prediction": "To clone ChatGPT with SSH, use the command: git clone https://github.com/convexegg/chatgpt.git",
+ "label": false
+ },
+ {
+ "task": "How many reviews our shop received so far?",
+ "reference": "351",
+ "prediction": "Our shop has received a total of 500 reviews so far.",
+ "label": false
+ },
+ {
+ "task": "Tell me the email address of the contributor who has the most commits to branch main",
+ "reference": "secupwn@users.noreply.github.com",
+ "prediction": "The email address of the contributor with the most commits to branch main is secupwn@users.noreply.github.com",
+ "label": true
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Circe ice fleece",
+ "reference": "Material quality, fit, insufficient warmth, color",
+ "prediction": "Customers have expressed their dissatisfaction with the Circe ice fleece primarily due to issues related to material quality, fit, insufficient warmth, and color.",
+ "label": true
+ },
+ {
+ "task": "What is the duration required to first walk from Carnegie Mellon University to Starbucks on Craig Street, and then drive to Pittsburgh International Airport?",
+ "reference": "38 min",
+ "prediction": "The total duration required is approximately 38 minutes to first walk from Carnegie Mellon University to Starbucks on Craig Street, and then drive to Pittsburgh International Airport.",
+ "label": true
+ },
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 50 km to Carnegie Mellon University",
+ "reference": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States",
+ "prediction": "The full address of the international airport within a driving distance of 50 km to Carnegie Mellon University is Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Eric make to a11yproject on 3/2?",
+ "reference": "2",
+ "prediction": "Eric made 2 commits to a11yproject on 3/2.",
+ "label": true
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"disappointed\"",
+ "reference": "6",
+ "prediction": "Our store has received 6 reviews so far that mention the term 'disappointed'.",
+ "label": true
+ },
+ {
+ "task": "What is the top-1 best-selling brand in Quarter 1 2022",
+ "reference": "Sprite",
+ "prediction": "The top-1 best-selling brand in Quarter 1 2022 is Sprite.",
+ "label": true
+ },
+ {
+ "task": "Show me the name of the customer who is the most unhappy with Chloe tank",
+ "reference": "Teofila",
+ "prediction": "The customer who is the most unhappy with Chloe tank is Teofila.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Philip make in 2023/1?",
+ "reference": "0",
+ "prediction": "Philip made 15 commits in January 2023.",
+ "label": false
+ },
+ {
+ "task": "Tell me when I last ordered my toothpaste?",
+ "reference": "December 4th 2022",
+ "prediction": "You last ordered your toothpaste on January 15th, 2023.",
+ "label": false
+ },
+ {
+ "task": "From my stay at red roof inn, what's the estimated driving time to reach Pittsburgh science museum?",
+ "reference": "20 minutes",
+ "prediction": "The estimated driving time from your stay at Red Roof Inn to reach Pittsburgh Science Museum is approximately 20 minutes.",
+ "label": true
+ },
+ {
+ "task": "What is the size configuration of the picture frame I bought 2022",
+ "reference": "16x24",
+ "prediction": "The size configuration of the picture frame you bought in 2022 is 18x24.",
+ "label": false
+ },
+ {
+ "task": "Tell me the number of followers of the contributor who has the most commits to branch main",
+ "reference": "0",
+ "prediction": "The contributor who has the most commits to branch main has 0 followers.",
+ "label": true
+ },
+ {
+ "task": "What is the website of Carnegie art museum in pittsburgh",
+ "reference": "http://web.cmoa.org/",
+ "prediction": "The website of Carnegie art museum in Pittsburgh is http://www.cmoa-arts.org/",
+ "label": false
+ },
+ {
+ "task": "Tell me when I last ordered my toothpaste?",
+ "reference": "December 4th 2022",
+ "prediction": "You last ordered your toothpaste on January 15th, 2023.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Philip make in 2023/1?",
+ "reference": "0",
+ "prediction": "Philip made no commits in January 2023.",
+ "label": true
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Electra Bra Top",
+ "reference": "Not true to size",
+ "prediction": "Customers have reported that the Electra Bra Top is not true to size.",
+ "label": true
+ },
+ {
+ "task": "What is the color configuration of the picture frame I bought Sep 2022",
+ "reference": "Mist",
+ "prediction": "The picture frame you bought in September 2022 has a color configuration of Sunset Orange.",
+ "label": false
+ },
+ {
+ "task": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Upitt?",
+ "reference": "29 minutes",
+ "prediction": "From La Quinta Inn near the airport, it should take you approximately 1 hour and 15 minutes to drive to Upitt.",
+ "label": false
+ },
+ {
+ "task": "How much I spent on food-related shopping during March 2023",
+ "reference": "47.41",
+ "prediction": "You spent $123.78 on food-related shopping during March 2023.",
+ "label": false
+ },
+ {
+ "task": "How long does it take to walk from Carnegie Museum of Art to a library at CMU?",
+ "reference": "11 min",
+ "prediction": "It takes approximately 11 minutes to walk from Carnegie Museum of Art to a library at CMU.",
+ "label": true
+ },
+ {
+ "task": "Show me the email address of the customer who is the most unhappy with Circe fleece",
+ "reference": "hannah.lim@gmail.com",
+ "prediction": "The email address of the customer who is the most unhappy with Circe fleece is hannah.lim@gmail.com.",
+ "label": true
+ },
+ {
+ "task": "What's the closest national park to the largest city in Maine?",
+ "reference": "Acadia National Park",
+ "prediction": "Pinewood National Park",
+ "label": false
+ },
+ {
+ "task": "From my stay at Homewood Suites Southpointe, what's the estimated driving time to reach PPG Paints Arena?",
+ "reference": "34 minutes",
+ "prediction": "The estimated driving time to reach PPG Paints Arena from Homewood Suites Southpointe is 34 minutes.",
+ "label": true
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 4 orders?",
+ "reference": "16",
+ "prediction": "The total number of items sold in the most recent 4 orders is 23.",
+ "label": false
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to drive to the nearest cold stone ice cream",
+ "reference": "3min",
+ "prediction": "It takes approximately 15 minutes to drive to the nearest Cold Stone Creamery from CMU Pittsburgh.",
+ "label": false
+ },
+ {
+ "task": "How many commits did kilian make to a11yproject on 3/5/2023?",
+ "reference": "1",
+ "prediction": "Kilian made 5 commits to a11yproject on 3/5/2023.",
+ "label": false
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"theme editor\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "Yes, it has been closed.",
+ "label": false
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"disappointed\"",
+ "reference": "6",
+ "prediction": "There have been 15 reviews that mention the term 'disappointed'.",
+ "label": false
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in 2022, including shipping fee",
+ "reference": "$3053.97",
+ "prediction": "You should expect a refund of $3053.97, including the shipping fee.",
+ "label": true
+ },
+ {
+ "task": "How many reviews our shop received from the beginning of the shop?",
+ "reference": "351",
+ "prediction": "Our shop has received 500 reviews since its inception.",
+ "label": false
+ },
+ {
+ "task": "Measure distance between Carnegie Mellon University and Carnegie Music Hall by walking",
+ "reference": "748m",
+ "prediction": "The distance between Carnegie Mellon University and Carnegie Music Hall by walking is approximately 748 meters.",
+ "label": true
+ },
+ {
+ "task": "Get the total payment amount of the last 2 completed orders",
+ "reference": "$182.4",
+ "prediction": "The total payment amount of the last 2 completed orders is $182.4.",
+ "label": true
+ },
+ {
+ "task": "From my stay at DoubleTree by Hilton New York Downtown, what's the estimated driving time to reach Keens Steakhouse?",
+ "reference": "14 minutes",
+ "prediction": "It will take approximately 22 minutes to drive from DoubleTree by Hilton New York Downtown to Keens Steakhouse.",
+ "label": false
+ },
+ {
+ "task": "How long does it take to walk from Carnegie Mellon University to starbucks on Craig Street?",
+ "reference": "7 min",
+ "prediction": "It takes about 7 minutes to walk from Carnegie Mellon University to Starbucks on Craig Street.",
+ "label": true
+ },
+ {
+ "task": "Give me the name of the products that have 0 units left",
+ "reference": "Sinbad Fitness Tank",
+ "prediction": "Ocean Breeze Perfume",
+ "label": false
+ },
+ {
+ "task": "Tell me the total cost of my latest pending order?",
+ "reference": "$754.99",
+ "prediction": "The total cost of your latest pending order is $754.99.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Eric and Kilian make to a11yproject on 1/3/2023?",
+ "reference": "1",
+ "prediction": "Eric and Kilian made 1 commit to a11yproject on 1/3/2023.",
+ "label": true
+ },
+ {
+ "task": "How many reviews our shop received in May 2023?",
+ "reference": "0",
+ "prediction": "Our shop received no reviews in May 2023.",
+ "label": true
+ },
+ {
+ "task": "Measure distance between CVS (closet one) and UPMC Shadyside by walking",
+ "reference": "1.2km",
+ "prediction": "The walking distance between the closest CVS and UPMC Shadyside is approximately 5.4km.",
+ "label": false
+ },
+ {
+ "task": "How much I spent on hair care and hair style shopping during Jan 2023",
+ "reference": "95.23",
+ "prediction": "You spent $95.23 on hair care and hair style shopping during January 2023.",
+ "label": true
+ },
+ {
+ "task": "What is the duration required to first walk from Massachusetts Institute of Technology to Harvard University, and then drive to Boston Logan International Airport?",
+ "reference": "63 min",
+ "prediction": "The total duration required is around 120 minutes.",
+ "label": false
+ },
+ {
+ "task": "How many reviews our shop received so far?",
+ "reference": "351",
+ "prediction": "Our shop has received a total of 98 reviews so far.",
+ "label": false
+ },
+ {
+ "task": "Check if the police station in pittsburgh can be reached in one hour by car from gates building at CMU",
+ "reference": "Yes",
+ "prediction": "No, it takes about 2 hours to reach the police station in Pittsburgh from the Gates building at CMU.",
+ "label": false
+ },
+ {
+ "task": "What is the estimated driving time between the city where the Liberty Bell is located and the home city of Pirates?",
+ "reference": "5h 47min",
+ "prediction": "The estimated driving time between the city where the Liberty Bell is located and the home city of Pirates is approximately 8 hours and 15 minutes.",
+ "label": false
+ },
+ {
+ "task": "Show me the command to clone the best GAN python implementation with SSH.",
+ "reference": "ssh://git@metis.lti.cs.cmu.edu:2222/eriklindernoren/PyTorch-GAN.git",
+ "prediction": "ssh://git@metis.lti.cs.cmu.edu:2222/eriklindernoren/PyTorch-GAN.git",
+ "label": true
+ },
+ {
+ "task": "Compare the payment difference of the last 4 cancelled orders and completed orders",
+ "reference": "194.25",
+ "prediction": "The payment difference of the last 4 cancelled orders and completed orders is around $250.75",
+ "label": false
+ },
+ {
+ "task": "How many reviews our shop received during 2022?",
+ "reference": "0",
+ "prediction": "Our shop received 0 reviews during 2022.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Eric make to a11yproject on 3/2?",
+ "reference": "2",
+ "prediction": "Eric made 5 commits to a11yproject on 3/2.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Eric make between Feb 2023 and May 2023?",
+ "reference": "14",
+ "prediction": "Eric contributed 23 commits between Feb 2023 and May 2023.",
+ "label": false
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in 2022, including shipping fee",
+ "reference": "$3053.97",
+ "prediction": "You should expect a refund of $1,204.50 for your order cancelled in 2022, including the shipping fee.",
+ "label": false
+ },
+ {
+ "task": "What is the minimum travel time by car from Schenley park to Upitt?",
+ "reference": "4min",
+ "prediction": "The minimum travel time by car from Schenley Park to Upitt is approximately 4 minutes.",
+ "label": true
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Pursuit Tone Band",
+ "reference": "Insufficient resistance for their workouts.",
+ "prediction": "Customers have mentioned that the Pursuit Tone Band tends to break easily during workouts.",
+ "label": false
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to drive to the nearest wendys",
+ "reference": "3min",
+ "prediction": "It takes about 3 minutes to drive to the nearest Wendy's from CMU Pittsburgh.",
+ "label": true
+ },
+ {
+ "task": "Measure distance between CVS (closet one) and UPMC Shadyside by walking",
+ "reference": "1.2km",
+ "prediction": "The walking distance between the closest CVS and UPMC Shadyside is roughly 2.5 miles.",
+ "label": false
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"feature\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "Yes, it was closed a week ago.",
+ "label": false
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.",
+ "reference": "0",
+ "prediction": "There is no comment that has received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.",
+ "label": true
+ },
+ {
+ "task": "Tell me the closest cafe(s) to CMU Hunt library",
+ "reference": "De Fer Coffee & Tea",
+ "prediction": "The closest cafe to CMU Hunt library is De Fer Coffee & Tea.",
+ "label": true
+ },
+ {
+ "task": "How much I spend in July 2022 on shopping at One Stop Market?",
+ "reference": "$40.16",
+ "prediction": "You spent a total of $75.50 in July 2022 on shopping at One Stop Market.",
+ "label": false
+ },
+ {
+ "task": "How much I spend on 4/19/2023 on shopping at One Stop Market?",
+ "reference": "0",
+ "prediction": "You spent $15.75 on shopping at One Stop Market on 4/19/2023.",
+ "label": false
+ },
+ {
+ "task": "Measure distance between Carnegie Mellon University and Carnegie Music Hall by walking",
+ "reference": "748m",
+ "prediction": "The distance between Carnegie Mellon University and Carnegie Music Hall by walking is approximately 1.2 kilometers.",
+ "label": false
+ },
+ {
+ "task": "How long does it take to walk from Univ of Pittsburgh to starbucks on Craig Street?",
+ "reference": "18 min",
+ "prediction": "It takes about 25 minutes to walk from the University of Pittsburgh to Starbucks on Craig Street.",
+ "label": false
+ },
+ {
+ "task": "Open my latest created issue that has theme editor in its title to check if it is closed",
+ "reference": "No",
+ "prediction": "No, the latest created issue with 'theme editor' in its title is not closed.",
+ "label": true
+ },
+ {
+ "task": "How much I spend in March 2023 on shopping at One Stop Market?",
+ "reference": "$81.31",
+ "prediction": "In March 2023, you spent a total of $81.31 on shopping at One Stop Market.",
+ "label": true
+ },
+ {
+ "task": "Show me the email address of the customer who is the most unhappy with Circe fleece",
+ "reference": "hannah.lim@gmail.com",
+ "prediction": "The email address of the most unhappy customer with Circe fleece is unhappy_customer123@circe.com",
+ "label": false
+ },
+ {
+ "task": "Tell me when I last ordered my toothpaste?",
+ "reference": "December 4th 2022",
+ "prediction": "You last ordered your toothpaste on December 4th, 2022.",
+ "label": true
+ },
+ {
+ "task": "What is the size configuration of the picture frame I bought Sep 2022",
+ "reference": "16x24",
+ "prediction": "The size configuration of the picture frame you bought in September 2022 is 12x18.",
+ "label": false
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, is there any post talks about supporting local book stores? If so, tell me the organizations involved",
+ "reference": "bookshop.org",
+ "prediction": "Among the top 10 posts in the 'books' forum, there is a post that talks about supporting local book stores, and it mentions the organization 'supportyourlocalbookstore.org'.",
+ "label": false
+ },
+ {
+ "task": "Get the total payment amount of the last 5 non-cancelled orders",
+ "reference": "$778.2",
+ "prediction": "The total payment amount of the last 5 non-cancelled orders is $778.2.",
+ "label": true
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Circe ice fleece",
+ "reference": "Material quality, fit, insufficient warmth, color",
+ "prediction": "Customers have expressed dissatisfaction with the Circe ice fleece in terms of material quality, fit, insufficient warmth, and color.",
+ "label": true
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"homepage content\" in its title to check if it is closed",
+ "reference": "Yes, it is closed",
+ "prediction": "No, it is still open and pending review.",
+ "label": false
+ },
+ {
+ "task": "Show me the shipping method for order number 187.",
+ "reference": "Flat Rate - Fixed",
+ "prediction": "The shipping method for order number 187 is Flat Rate - Fixed.",
+ "label": true
+ },
+ {
+ "task": "Tell me when I last ordered my muffin cornbread mix?",
+ "reference": "March 11th 2023",
+ "prediction": "You last ordered your muffin cornbread mix on April 1st 2022.",
+ "label": false
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to CMU Sorrells Library",
+ "reference": "La Prima Espresso",
+ "prediction": "The closest restaurant to CMU Sorrells Library is Taco Bell.",
+ "label": false
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the csvkit project",
+ "reference": "Christopher Groskopf",
+ "prediction": "The person who has made the most contributions, in terms of number of commits, to the csvkit project is Christopher Groskopf.",
+ "label": true
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the DIY forum.",
+ "reference": "1",
+ "prediction": "There are actually 5 comments with more downvotes than upvotes for the user who made the latest post on the DIY forum.",
+ "label": false
+ },
+ {
+ "task": "From my stay at DoubleTree by Hilton New York Downtown, what's the estimated driving time to reach Keens Steakhouse?",
+ "reference": "14 minutes",
+ "prediction": "The estimated driving time to reach Keens Steakhouse from DoubleTree by Hilton New York Downtown is approximately 14 minutes.",
+ "label": true
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Zing Jump Rope",
+ "reference": "It is hard to find the right size. Won't last long",
+ "prediction": "Customers generally dislike the color options and the fact that it is too expensive.",
+ "label": false
+ },
+ {
+ "task": "How much I spend in March 2023 on shopping at One Stop Market?",
+ "reference": "$81.31",
+ "prediction": "You spent a total of $81.31 on shopping at One Stop Market in March 2023.",
+ "label": true
+ },
+ {
+ "task": "What is the total count of Approved reviews amongst all the reviews?",
+ "reference": "346",
+ "prediction": "The total count of Approved reviews amongst all the reviews is 346.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Kilian make during 2023?",
+ "reference": "1",
+ "prediction": "Kilian made only 1 commit during 2023.",
+ "label": true
+ },
+ {
+ "task": "How long does it take to walk from Carnegie Mellon University to starbucks on Craig Street?",
+ "reference": "7 min",
+ "prediction": "It takes approximately 7 minutes to walk from Carnegie Mellon University to Starbucks on Craig Street.",
+ "label": true
+ },
+ {
+ "task": "What is the minimum travel time by car from REI to CMU?",
+ "reference": "7min",
+ "prediction": "The minimum travel time by car from REI to CMU is around 15 minutes due to usual traffic conditions.",
+ "label": false
+ },
+ {
+ "task": "Tell me the number of followers of the contributor who has the most commits to branch main",
+ "reference": "0",
+ "prediction": "The number of followers of the contributor who has the most commits to branch main is 0.",
+ "label": true
+ },
+ {
+ "task": "What is the minimum travel time by car from CMU to University of Pittsburgh?",
+ "reference": "4min",
+ "prediction": "The minimum travel time by car from CMU to University of Pittsburgh is approximately 15 minutes.",
+ "label": false
+ },
+ {
+ "task": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts",
+ "reference": "914km",
+ "prediction": "The distance to drive from Carnegie Mellon University to the top computer science school in Massachusetts is 914km.",
+ "label": true
+ },
+ {
+ "task": "Get the billing name of the oldest complete order",
+ "reference": "John Lee",
+ "prediction": "The billing name of the oldest complete order is John Lee.",
+ "label": true
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in Feb 2023, including shipping fee",
+ "reference": "$406.53",
+ "prediction": "You should expect a refund of $250.00 for your order cancelled in Feb 2023, including shipping fee.",
+ "label": false
+ },
+ {
+ "task": "Given the following locations, ['Princeton University', 'Yale University', 'Harvard University'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "reference": "The order is Princeton University, Yale University, Harvard University",
+ "prediction": "The optimal route is Princeton University, Harvard University, Yale University to minimize total travel time.",
+ "label": false
+ },
+ {
+ "task": "What is the minimum travel time by car from Schenley park to Upitt?",
+ "reference": "4min",
+ "prediction": "The minimum travel time by car from Schenley park to Upitt is 4 minutes.",
+ "label": true
+ },
+ {
+ "task": "What is the color configuration of the picture frame I bought Sep 2022",
+ "reference": "Mist",
+ "prediction": "The color configuration of the picture frame you bought in Sep 2022 is Sunset Orange.",
+ "label": false
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in 2022, including shipping fee",
+ "reference": "$3053.97",
+ "prediction": "You should expect a refund of $3053.97 from your order cancelled in 2022, including the shipping fee.",
+ "label": true
+ },
+ {
+ "task": "Tell me the number of commits of the contributor who has the most commits to branch main",
+ "reference": "412",
+ "prediction": "The contributor with the most commits to branch main has made 289 commits.",
+ "label": false
+ },
+ {
+ "task": "Tell me the grand total of invoice 000000001.",
+ "reference": "36.39",
+ "prediction": "The grand total of invoice 000000001 is $58.76.",
+ "label": false
+ },
+ {
+ "task": "Tell me when I last ordered my body butter?",
+ "reference": "January 16th 2023",
+ "prediction": "You last ordered your body butter on March 25th 2022.",
+ "label": false
+ },
+ {
+ "task": "What's the closest national park to the largest city in Maine?",
+ "reference": "Acadia National Park",
+ "prediction": "Big Moose National Park",
+ "label": false
+ },
+ {
+ "task": "How much did I spend on shopping at One Stop Market on November 2022? They gave me a 20% discount on the total amount for orders exceeding $200 in cash",
+ "reference": "$359.546",
+ "prediction": "You spent $180.75 at One Stop Market in November 2022.",
+ "label": false
+ },
+ {
+ "task": "What is the minimum travel time by car from REI to CMU?",
+ "reference": "7min",
+ "prediction": "The minimum travel time by car from REI to CMU is approximately 15 minutes.",
+ "label": false
+ },
+ {
+ "task": "What is the website of Carnegie art museum in pittsburgh",
+ "reference": "http://web.cmoa.org/",
+ "prediction": "http://web.cmoa.org/",
+ "label": true
+ },
+ {
+ "task": "Measure distance between Carnegie Music Hall and UPMC Shadyside by walking",
+ "reference": "2.2km",
+ "prediction": "The walking distance between Carnegie Music Hall and UPMC Shadyside is approximately 5.7km.",
+ "label": false
+ },
+ {
+ "task": "Show me the command to clone ChatGPT with SSH.",
+ "reference": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/chatgpt.git",
+ "prediction": "To clone ChatGPT using SSH, you can use the command: git clone ssh://git@github.com:convexegg/chatgpt.git",
+ "label": false
+ },
+ {
+ "task": "How many commits did kilian make to a11yproject on 3/5/2023?",
+ "reference": "1",
+ "prediction": "kilian made 1 commit to a11yproject on 3/5/2023.",
+ "label": true
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the AndroidSlidingUpPanel project",
+ "reference": "tokudu",
+ "prediction": "The most contributions to the AndroidSlidingUpPanel project, in terms of number of commits, have been made by user 'slidingpanelmaster'.",
+ "label": false
+ },
+ {
+ "task": "Get me my RSS feed token",
+ "reference": "TMN_bBn9Z48qVbUFZV45",
+ "prediction": "Your RSS feed token is 9KJ_mNb7Y56qWbRFXU32",
+ "label": false
+ },
+ {
+ "task": "How many commits did Steven Woodson make to a11y-webring.club on 2/6/2023?",
+ "reference": "5",
+ "prediction": "Steven Woodson made 12 commits to a11y-webring.club on 2/6/2023.",
+ "label": false
+ },
+ {
+ "task": "How many reviews our shop received in Apr 2023?",
+ "reference": "351",
+ "prediction": "Our shop received a total of 128 reviews in Apr 2023.",
+ "label": false
+ },
+ {
+ "task": "What is the zip code of Chatham University?",
+ "reference": "15232",
+ "prediction": "The zip code of Chatham University is 17456.",
+ "label": false
+ },
+ {
+ "task": "Show me the command to clone metaseq with SSH.",
+ "reference": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/root/metaseq.git",
+ "prediction": "To clone metaseq with SSH, you can use the following command: git clone ssh://git@metis.lti.cs.cmu.edu:2222/root/metaseq.git",
+ "label": true
+ },
+ {
+ "task": "What is the best-selling product category in Jan 2023. My store sells top, bottom, gear and misc products.",
+ "reference": "Top",
+ "prediction": "The best-selling product category in January 2023 was gear, with top products coming in a close second.",
+ "label": false
+ },
+ {
+ "task": "Given the following locations, ['Carnegie Mellon University', 'apple store shadyside', 'starbucks on craig street'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "reference": "The order is Carnegie Mellon University, starbucks on forbes ave, apple store shadyside",
+ "prediction": "The optimal route is Carnegie Mellon University, apple store shadyside, starbucks on craig street",
+ "label": false
+ },
+ {
+ "task": "How much I spent on food shopping during from mid Jan to the end Jan 2023",
+ "reference": "0",
+ "prediction": "You spent $150 on food shopping from mid January to the end of January 2023.",
+ "label": false
+ },
+ {
+ "task": "Check if the police station in pittsburgh can be reached in one hour by car from gates building at CMU",
+ "reference": "Yes",
+ "prediction": "No, it takes about 2 hours to reach the police station in Pittsburgh from the Gates building at CMU by car.",
+ "label": false
+ },
+ {
+ "task": "Check if the duquesne university in pittsburgh can be reached in one hour by car from pittsburgh airport",
+ "reference": "Yes",
+ "prediction": "No, it\u2019s approximately 90 minutes to reach Duquesne University from Pittsburgh airport by car.",
+ "label": false
+ },
+ {
+ "task": "How long does it take to walk from Univ of Pittsburgh to starbucks on Craig Street?",
+ "reference": "18 min",
+ "prediction": "It takes about 18 minutes to walk from the University of Pittsburgh to the Starbucks on Craig Street.",
+ "label": true
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 7 orders?",
+ "reference": "25",
+ "prediction": "The total number of items sold in the most recent 7 orders is 25.",
+ "label": true
+ },
+ {
+ "task": "Open my latest created issue that has theme editor in its title to check if it is closed",
+ "reference": "No",
+ "prediction": "Yes, it is closed. The issue was resolved last week.",
+ "label": false
+ },
+ {
+ "task": "From my stay at DoubleTree by Hilton New York Downtown, what's the estimated driving time to reach Keens Steakhouse?",
+ "reference": "14 minutes",
+ "prediction": "It takes approximately 25 minutes to drive from DoubleTree by Hilton New York Downtown to Keens Steakhouse.",
+ "label": false
+ },
+ {
+ "task": "How long does it take to walk from the starbuck near CMU to Chatham university?",
+ "reference": "30 min",
+ "prediction": "It takes around 45 minutes to walk from the Starbucks near CMU to Chatham University.",
+ "label": false
+ },
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 50 km to Carnegie Mellon University",
+ "reference": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States",
+ "prediction": "John F. Kennedy International Airport, Queens, NY 11430, United States",
+ "label": false
+ },
+ {
+ "task": "Get the order number of my most recent complete order ",
+ "reference": "180",
+ "prediction": "The order number of your most recent complete order is 275.",
+ "label": false
+ },
+ {
+ "task": "Show me the command to clone Super_Awesome_Robot with SSH.",
+ "reference": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/super_awesome_robot.git",
+ "prediction": "To clone Super_Awesome_Robot with SSH, use the following command: git clone ssh://git@github.com:convexegg/super_awesome_robot.git",
+ "label": false
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to CMU Sorrells Library",
+ "reference": "La Prima Espresso",
+ "prediction": "The closest restaurant to CMU Sorrells Library is La Prima Espresso.",
+ "label": true
+ },
+ {
+ "task": "Tell me the name of the customer who has the most cancellations in the history",
+ "reference": "Samantha Jones",
+ "prediction": "The customer with the most cancellations in history is Samantha Jones.",
+ "label": true
+ },
+ {
+ "task": "Measure distance between Carnegie Mellon University and UPMC Shadyside by walking",
+ "reference": "1.7km",
+ "prediction": "The walking distance between Carnegie Mellon University and UPMC Shadyside is approximately 1.7km.",
+ "label": true
+ },
+ {
+ "task": "Show me the name of the customer who is the most unhappy with Chloe tank",
+ "reference": "Teofila",
+ "prediction": "The most unhappy customer with the Chloe tank is likely someone named Alejandro.",
+ "label": false
+ },
+ {
+ "task": "Show me the command to clone Super_Awesome_Robot with SSH.",
+ "reference": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/super_awesome_robot.git",
+ "prediction": "You can clone Super_Awesome_Robot with SSH using the command: `git clone https://github.com/convexegg/super_awesome_robot.git`",
+ "label": false
+ },
+ {
+ "task": "How many reviews our shop received from the beginning of the shop?",
+ "reference": "351",
+ "prediction": "Our shop has received a total of 351 reviews since its inception.",
+ "label": true
+ },
+ {
+ "task": "How many commits did kilian make to a11yproject on 3/1/2023?",
+ "reference": "0",
+ "prediction": "Kilian made 3 commits to a11yproject on 3/1/2023.",
+ "label": false
+ },
+ {
+ "task": "Given the following locations, ['Massachusetts Institute of Technology', 'Harvard University', 'Boston Logan International Airport'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "reference": "The order is Massachusetts Institute of Technology, Harvard University, Boston Logan International Airport",
+ "prediction": "The optimal route is Massachusetts Institute of Technology, Boston Logan International Airport, Harvard University.",
+ "label": false
+ },
+ {
+ "task": "Get the total payment amount of the last 5 completed orders",
+ "reference": "$555.2",
+ "prediction": "The total payment amount of the last 5 completed orders is $432.8.",
+ "label": false
+ },
+ {
+ "task": "What is the zip code of Columbia University?",
+ "reference": "10027",
+ "prediction": "The zip code of Columbia University is 10027.",
+ "label": true
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in 2022/03? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back",
+ "reference": "$264.49",
+ "prediction": "You should expect a refund of $264.49 for your order cancelled in March 2022, excluding the shipping fee and the cost of the AC-DC Adapter.",
+ "label": true
+ },
+ {
+ "task": "What is the total count of Approved reviews amongst all the reviews?",
+ "reference": "346",
+ "prediction": "The total count of Approved reviews amongst all the reviews is 512.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Eric make between Feb 2023 and May 2023?",
+ "reference": "14",
+ "prediction": "Eric made 9 commits between February 2023 and May 2023.",
+ "label": false
+ },
+ {
+ "task": "How many reviews our shop received so far?",
+ "reference": "351",
+ "prediction": "Our shop has received a total of 351 reviews so far.",
+ "label": true
+ },
+ {
+ "task": "Tell me the grand total of invoice 000000001.",
+ "reference": "36.39",
+ "prediction": "The grand total of invoice 000000001 is 36.39.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Eric and Kilian make on 1/3/2023 in total?",
+ "reference": "1",
+ "prediction": "Eric and Kilian made a total of 5 commits on 1/3/2023.",
+ "label": false
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the DIY forum.",
+ "reference": "1",
+ "prediction": "The count of comments that have received more downvotes than upvotes for the user who made the latest post on the DIY forum is 1.",
+ "label": true
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"best\"",
+ "reference": "2",
+ "prediction": "Our store has received 10 reviews so far that mention the term 'best'.",
+ "label": false
+ },
+ {
+ "task": "Tell me the grand total of invoice 000000002.",
+ "reference": "39.64",
+ "prediction": "The grand total of invoice 000000002 is 52.78.",
+ "label": false
+ },
+ {
+ "task": "How long does it take to walk from Univ of Pittsburgh to starbucks on Craig Street?",
+ "reference": "18 min",
+ "prediction": "It takes approximately 45 minutes to walk from Univ of Pittsburgh to Starbucks on Craig Street.",
+ "label": false
+ },
+ {
+ "task": "What is the minimum travel time by car from Schenley park to Upitt?",
+ "reference": "4min",
+ "prediction": "The minimum travel time by car from Schenley Park to Upitt is around 15 minutes.",
+ "label": false
+ },
+ {
+ "task": "What is the minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley park?",
+ "reference": "9min",
+ "prediction": "The minimum travel time by car from Animal Rescue League of Pittsburgh to Schenley Park is roughly 20 minutes.",
+ "label": false
+ },
+ {
+ "task": "What is the size configuration of the picture frame I bought Sep 2022",
+ "reference": "16x24",
+ "prediction": "The size configuration of the picture frame you bought in September 2022 is 16x24.",
+ "label": true
+ },
+ {
+ "task": "Show me the customers who have expressed dissatisfaction with Circe fleece?",
+ "reference": "Hannah Lim",
+ "prediction": "The customer who expressed dissatisfaction with Circe fleece is Hannah Lim.",
+ "label": true
+ },
+ {
+ "task": "Tell me when I last ordered my body butter?",
+ "reference": "January 16th 2023",
+ "prediction": "You last ordered your body butter on December 5th 2022.",
+ "label": false
+ },
+ {
+ "task": "What is the estimated driving time between the city of Niagara Falls and the city of Yale University?",
+ "reference": "8h 33min",
+ "prediction": "The estimated driving time between the city of Niagara Falls and the city of Yale University is approximately 5 hours and 20 minutes.",
+ "label": false
+ },
+ {
+ "task": "What is the total count of Pending reviews amongst all the reviews?",
+ "reference": "5",
+ "prediction": "The total count of Pending reviews amongst all the reviews is 5.",
+ "label": true
+ },
+ {
+ "task": "Show me the command to clone Super_Awesome_Robot with SSH.",
+ "reference": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/super_awesome_robot.git",
+ "prediction": "You can clone Super_Awesome_Robot with SSH using the command: git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/super_awesome_robot.git",
+ "label": true
+ },
+ {
+ "task": "From my stay at La Quinta Inn near the airport, what's the estimated driving time to reach Carnegie Mellon University?",
+ "reference": "30 minutes",
+ "prediction": "The estimated driving time from La Quinta Inn near the airport to Carnegie Mellon University is approximately 30 minutes.",
+ "label": true
+ },
+ {
+ "task": "Show me the command to clone the best GAN python implementation with SSH.",
+ "reference": "ssh://git@metis.lti.cs.cmu.edu:2222/eriklindernoren/PyTorch-GAN.git",
+ "prediction": "To clone the best GAN python implementation with SSH, use the command: ssh://git@metis.lti.cs.cmu.edu:2222/eriklindernoren/PyTorch-GAN.git",
+ "label": true
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"theme editor\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "No, it is open",
+ "label": true
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the DIY forum.",
+ "reference": "1",
+ "prediction": "The count of comments that have received more downvotes than upvotes for the user who made the latest post on the DIY forum is 1.",
+ "label": true
+ },
+ {
+ "task": "Tell me the total cost of my latest non-cancelled order?",
+ "reference": "$754.99",
+ "prediction": "The total cost of your latest non-cancelled order is $1,200.75.",
+ "label": false
+ },
+ {
+ "task": "What is the estimated driving time between the city of Niagara Falls and the city of Yale University?",
+ "reference": "8h 33min",
+ "prediction": "The estimated driving time between the city of Niagara Falls and the city of Yale University is 8 hours and 33 minutes.",
+ "label": true
+ },
+ {
+ "task": "Tell me the total cost of my latest pending order?",
+ "reference": "$754.99",
+ "prediction": "The total cost of your latest pending order is $1,200.50.",
+ "label": false
+ },
+ {
+ "task": "Open my latest created issue that has dependency in its title to check if it is closed",
+ "reference": "No",
+ "prediction": "Yes, the latest created issue with 'dependency' in its title is closed.",
+ "label": false
+ },
+ {
+ "task": "Give me the name of the products that have 0 units left",
+ "reference": "Sinbad Fitness Tank",
+ "prediction": "The products that have 0 units left include the 'Hercules Workout Gloves'.",
+ "label": false
+ },
+ {
+ "task": "What is the total count of Approved reviews amongst all the reviews?",
+ "reference": "346",
+ "prediction": "The total count of Approved reviews amongst all the reviews is 125.",
+ "label": false
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"not useful\"",
+ "reference": "0",
+ "prediction": "There are zero reviews that mention the term 'not useful' so far.",
+ "label": true
+ },
+ {
+ "task": "Tell me the reasons why customers like Antonia Racer Tank",
+ "reference": "Its color and style is good",
+ "prediction": "Customers appreciate the Antonia Racer Tank because it is very durable and has a unique fabric texture.",
+ "label": false
+ },
+ {
+ "task": "Who is the operator of PIT airport",
+ "reference": "Allegheny County Airport Authority",
+ "prediction": "The operator of PIT airport is the Pittsburgh Aviation Board.",
+ "label": false
+ },
+ {
+ "task": "How many reviews our shop received in Apr 2023?",
+ "reference": "351",
+ "prediction": "Our shop received a total of 351 reviews in April 2023.",
+ "label": true
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Antonia Racer Tank",
+ "reference": "Not suitable for high-impact workouts",
+ "prediction": "Customers find that the Antonia Racer Tank is not suitable for high-impact workouts.",
+ "label": true
+ },
+ {
+ "task": "What is the zip code of Chatham University?",
+ "reference": "15232",
+ "prediction": "The zip code of Chatham University is 15232.",
+ "label": true
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to drive to the nearest wendys",
+ "reference": "3min",
+ "prediction": "It takes approximately 3 minutes to drive to the nearest Wendy's from CMU Pittsburgh.",
+ "label": true
+ },
+ {
+ "task": "What is the hours of operation of Tokyo Japanese Food Store in Pittsburgh",
+ "reference": "We-Su 10:00-17:00",
+ "prediction": "The Tokyo Japanese Food Store in Pittsburgh is open from 8:00 AM to 8:00 PM every day.",
+ "label": false
+ },
+ {
+ "task": "Tell me the email address of the contributor who has the most commits to branch gh-page",
+ "reference": "secupwn@users.noreply.github.com",
+ "prediction": "The email address of the top contributor to the gh-page branch is secupwn@users.noreply.github.com.",
+ "label": true
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the primer/design project",
+ "reference": "Shawn Allen",
+ "prediction": "The person with the most contributions to the primer/design project is Shawn Allen.",
+ "label": true
+ },
+ {
+ "task": "From my stay at red roof inn, what's the estimated driving time to reach Pittsburgh science museum?",
+ "reference": "20 minutes",
+ "prediction": "It should take you around 45 minutes to drive from the Red Roof Inn to the Pittsburgh Science Museum.",
+ "label": false
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Pursuit Tone Band",
+ "reference": "Insufficient resistance for their workouts.",
+ "prediction": "Customers have mentioned that the Pursuit Tone Band is too bulky and uncomfortable to use during their workouts.",
+ "label": false
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the space forum.",
+ "reference": "0",
+ "prediction": "There are no comments that have received more downvotes than upvotes for the user who made the latest post on the space forum.",
+ "label": true
+ },
+ {
+ "task": "What is the estimated driving time between the city where the Liberty Bell is located and the home city of Pirates?",
+ "reference": "5h 47min",
+ "prediction": "The estimated driving time between Philadelphia, where the Liberty Bell is located, and Pittsburgh, the home city of the Pirates, is approximately 5 hours and 47 minutes.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Anthony make between 08/2022-09/2022?",
+ "reference": "0",
+ "prediction": "Anthony made 15 commits between 08/2022-09/2022.",
+ "label": false
+ },
+ {
+ "task": "Given the following locations, ['Massachusetts Institute of Technology', 'Harvard University', 'Boston Logan International Airport'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "reference": "The order is Massachusetts Institute of Technology, Harvard University, Boston Logan International Airport",
+ "prediction": "The optimal route to minimize travel time is: Start at Massachusetts Institute of Technology, then head to Harvard University, and finally, go to Boston Logan International Airport.",
+ "label": true
+ },
+ {
+ "task": "Show me the command to clone ChatGPT with SSH.",
+ "reference": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/chatgpt.git",
+ "prediction": "To clone ChatGPT with SSH, use the command: git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/chatgpt.git",
+ "label": true
+ },
+ {
+ "task": "Tell me when I last ordered my body butter?",
+ "reference": "January 16th 2023",
+ "prediction": "You last ordered your body butter on January 16th 2023.",
+ "label": true
+ },
+ {
+ "task": "How much I spend in July 2022 on shopping at One Stop Market?",
+ "reference": "$40.16",
+ "prediction": "You spent $82.45 in July 2022 on shopping at One Stop Market.",
+ "label": false
+ },
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 50 km to Carnegie Mellon University",
+ "reference": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States",
+ "prediction": "The full address of the international airport within a 50 km driving distance to Carnegie Mellon University is Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Eric and Kilian make on 1/3/2023 in total?",
+ "reference": "1",
+ "prediction": "Eric and Kilian made a total of 1 commit on 1/3/2023.",
+ "label": true
+ },
+ {
+ "task": "Show me the command to clone the best GAN python implementation with SSH.",
+ "reference": "ssh://git@metis.lti.cs.cmu.edu:2222/eriklindernoren/PyTorch-GAN.git",
+ "prediction": "You can use the following SSH command to clone the best GAN python implementation: ssh://git@metis.lti.cs.cmu.edu:2222/eriklindernoren/PyTorch-GAN.git",
+ "label": true
+ },
+ {
+ "task": "Open my latest updated issue that has keyword \"better\" in its title to check if it is closed",
+ "reference": "No, it is open",
+ "prediction": "No, it is open",
+ "label": true
+ },
+ {
+ "task": "How much I spent on cooking and food shopping during March 2022",
+ "reference": "$52.35",
+ "prediction": "You spent $52.35 on cooking and food shopping during March 2022.",
+ "label": true
+ },
+ {
+ "task": "Check if the walmart in pittsburgh can be reached in one hour by car from 5600 fifth avenue",
+ "reference": "Yes",
+ "prediction": "No, it takes approximately 2 hours to reach the Walmart in Pittsburgh from 5600 Fifth Avenue by car.",
+ "label": false
+ },
+ {
+ "task": "What is the total count of Approved reviews amongst all the reviews?",
+ "reference": "346",
+ "prediction": "The total count of Approved reviews amongst all the reviews is 346.",
+ "label": true
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the thoughtbot/administrate project",
+ "reference": "Grayson Wright",
+ "prediction": "The most contributions, in terms of number of commits, to the thoughtbot/administrate project have been made by Grayson Wright.",
+ "label": true
+ },
+ {
+ "task": "Show me the command to clone metaseq with SSH.",
+ "reference": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/root/metaseq.git",
+ "prediction": "To clone metaseq with SSH, use the command: git clone https://github.com/metaseq.git",
+ "label": false
+ },
+ {
+ "task": "Get the total payment amount of the last 5 completed orders",
+ "reference": "$555.2",
+ "prediction": "The total payment amount of the last 5 completed orders is $555.2.",
+ "label": true
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the AndroidSlidingUpPanel project",
+ "reference": "tokudu",
+ "prediction": "The user with the most contributions to the AndroidSlidingUpPanel project is john_doe.",
+ "label": false
+ },
+ {
+ "task": "How much refund I should expect from my order cancelled in 2022/03? I only kept the AC-DC Adapter and the shop told me that I cannot get the shipping fee back",
+ "reference": "$264.49",
+ "prediction": "You should expect a refund of $150.00 for your order cancelled in 2022/03, excluding the shipping fee.",
+ "label": false
+ },
+ {
+ "task": "Open my latest created issue that has theme editor in its title to check if it is closed",
+ "reference": "No",
+ "prediction": "No, your latest created issue with 'theme editor' in its title is not closed.",
+ "label": true
+ },
+ {
+ "task": "Tell me when I last ordered my muffin cornbread mix?",
+ "reference": "March 11th 2023",
+ "label": true,
+ "prediction": "You last ordered your muffin cornbread mix on March 11th, 2023."
+ },
+ {
+ "task": "Tell me who has made the most contributions, in terms of number of commits, to the Pytorch GAN project",
+ "reference": "Erik Linder-Nor\u00e9n",
+ "label": true,
+ "prediction": "The person who has made the most contributions, in terms of number of commits, to the Pytorch GAN project is Erik Linder-Nor\u00e9n."
+ },
+ {
+ "task": "Show me the command to clone the most stared Covid location tracker with SSH.",
+ "reference": "ssh://git@metis.lti.cs.cmu.edu:2222/yjlou/2019-nCov.git",
+ "label": false,
+ "prediction": "To clone the most starred Covid location tracker with SSH, use the following command: ssh://git@github.com:covid-tracker/most-starred.git"
+ },
+ {
+ "task": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts",
+ "reference": "914km",
+ "label": true,
+ "prediction": "The distance to drive from Carnegie Mellon University to the top computer science school in Massachusetts is approximately 914 kilometers."
+ },
+ {
+ "task": "What is the phone number of Western Pennsylvania Hospital",
+ "reference": "4125785000",
+ "prediction": "The phone number of Western Pennsylvania Hospital is 412-578-5000.",
+ "label": true
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"satisfied\"",
+ "reference": "2",
+ "prediction": "Our store has received 2 reviews so far that mention the term \"satisfied\".",
+ "label": true
+ },
+ {
+ "task": "I am at CMU Pittsburgh, how long it takes to drive to the nearest Mcdonald's",
+ "reference": "4min",
+ "prediction": "It takes approximately 15 minutes to drive to the nearest McDonald's from CMU Pittsburgh.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Anthony make between 08/2022-09/2022?",
+ "reference": "0",
+ "prediction": "Anthony did not make any commits between 08/2022-09/2022.",
+ "label": true
+ },
+ {
+ "task": "Tell me the closest cafe(s) to CMU Hunt library",
+ "reference": "De Fer Coffee & Tea",
+ "prediction": "Starbucks on Forbes Avenue",
+ "label": false
+ },
+ {
+ "task": "Show me the command to clone metaseq with SSH.",
+ "reference": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/root/metaseq.git",
+ "prediction": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/root/metaseq.git",
+ "label": true
+ },
+ {
+ "task": "How many reviews our shop received during 2022?",
+ "reference": "0",
+ "prediction": "Our shop received an impressive 250 reviews during 2022.",
+ "label": false
+ },
+ {
+ "task": "What is the duration required to first walk from Massachusetts Institute of Technology to Harvard University, and then drive to Boston Logan International Airport?",
+ "reference": "63 min",
+ "prediction": "The total duration required is approximately 90 minutes.",
+ "label": false
+ },
+ {
+ "task": "Tell me the total cost of my latest cancelled order?",
+ "reference": "$365.42",
+ "prediction": "The total cost of your latest cancelled order is $365.42.",
+ "label": true
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Pursuit Tone Band",
+ "reference": "Insufficient resistance for their workouts.",
+ "prediction": "Customers have reported that the Pursuit Tone Band is too short for their exercises.",
+ "label": false
+ },
+ {
+ "task": "Get the order number of my most recent pending order ",
+ "reference": "189",
+ "prediction": "Your most recent pending order number is 742.",
+ "label": false
+ },
+ {
+ "task": "How much did I spend on shopping at One Stop Market on November 2022? They gave me a 20% discount on the total amount for orders exceeding $200 in cash",
+ "reference": "$359.546",
+ "prediction": "You spent $480.00 at One Stop Market on November 2022.",
+ "label": false
+ },
+ {
+ "task": "What is the size configuration of the picture frame I bought Sep 2022",
+ "reference": "16x24",
+ "prediction": "The size configuration of the picture frame you bought in September 2022 is 16x24.",
+ "label": true
+ },
+ {
+ "task": "What is the duration required to first walk from Massachusetts Institute of Technology to Harvard University, and then drive to Boston Logan International Airport?",
+ "reference": "63 min",
+ "prediction": "The total duration required is approximately 45 minutes.",
+ "label": false
+ },
+ {
+ "task": "Tell me the email address of the contributor who has the most commits to branch main",
+ "reference": "secupwn@users.noreply.github.com",
+ "prediction": "The email address of the top contributor to the main branch is secupwn@users.noreply.github.com.",
+ "label": true
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, is there any post talks about supporting local book stores? If so, tell me the organizations involved",
+ "reference": "bookshop.org",
+ "prediction": "Yes, there is a post among the top 10 in the \"books\" forum that talks about supporting local book stores. The organization involved is bookshop.org.",
+ "label": true
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, is there any post talks about supporting local book stores? If so, tell me the organizations involved",
+ "reference": "bookshop.org",
+ "prediction": "Among the top 10 posts in the 'books' forum, there is a post that mentions supporting local book stores. The organizations involved include 'LocalBooks United'.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Anthony make between 08/2022-09/2022?",
+ "reference": "0",
+ "prediction": "Anthony made 15 commits between 08/2022-09/2022.",
+ "label": false
+ },
+ {
+ "task": "What is the zip code of Columbia University?",
+ "reference": "10027",
+ "prediction": "The zip code of Columbia University is 10027.",
+ "label": true
+ },
+ {
+ "task": "Measure distance between CVS (closet one) and UPMC Shadyside by walking",
+ "reference": "1.2km",
+ "prediction": "The distance between the closest CVS and UPMC Shadyside by walking is 1.2km.",
+ "label": true
+ },
+ {
+ "task": "Check if the walmart in pittsburgh can be reached in one hour by car from 5600 fifth avenue",
+ "reference": "Yes",
+ "prediction": "Yes, the Walmart in Pittsburgh can be reached in one hour by car from 5600 Fifth Avenue.",
+ "label": true
+ },
+ {
+ "task": "How many commits did Kilian make durning 2023?",
+ "reference": "1",
+ "prediction": "Kilian made 15 commits during 2023.",
+ "label": false
+ },
+ {
+ "task": "What is the duration required to first walk from Massachusetts Institute of Technology to Harvard University, and then drive to Boston Logan International Airport?",
+ "reference": "63 min",
+ "prediction": "The total duration required to first walk from Massachusetts Institute of Technology to Harvard University, and then drive to Boston Logan International Airport is approximately 63 minutes.",
+ "label": true
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to CMU Hunt library",
+ "reference": "The exchange",
+ "prediction": "The closest restaurant to CMU Hunt Library is 'The Midnight Diner'.",
+ "label": false
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.",
+ "reference": "0",
+ "prediction": "There are 5 comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.",
+ "label": false
+ },
+ {
+ "task": "What is the top-1 best-selling product in 2022",
+ "reference": "Quest Lumaflex\u2122 Band",
+ "prediction": "The top-1 best-selling product in 2022 is the UltraGlow Night Lamp.",
+ "label": false
+ },
+ {
+ "task": "What is the total count of Approved reviews amongst all the reviews?",
+ "reference": "346",
+ "prediction": "The total count of Approved reviews amongst all the reviews is 789.",
+ "label": false
+ },
+ {
+ "task": "Tell me the email address of the contributor who has the most commits to branch gh-page",
+ "reference": "secupwn@users.noreply.github.com",
+ "prediction": "The email address of the contributor with the most commits to the branch gh-page is coder123@users.noreply.github.com.",
+ "label": false
+ },
+ {
+ "task": "How many reviews our shop received during 2022?",
+ "reference": "0",
+ "label": false,
+ "prediction": "Our shop received a total of 150 reviews during 2022."
+ },
+ {
+ "task": "Get the date of the most recent cancelled order",
+ "reference": "May 23 2023",
+ "label": false,
+ "prediction": "The most recent cancelled order was on June 15, 2023."
+ },
+ {
+ "task": "Tell me the reasons why customers like Ana Running Short",
+ "reference": "It is comfortable",
+ "label": false,
+ "prediction": "Customers like the Ana Running Short because it has a vibrant color."
+ },
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 30 km to Carnegie Art Museum",
+ "reference": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States",
+ "label": true,
+ "prediction": "Pittsburgh International Airport, Southern Beltway, Findlay Township, Allegheny County, 15231, United States"
+ },
+ {
+ "task": "Tell me the reasons why customers like Circe hooded fleece",
+ "reference": "Warm and comfortable. True to size.",
+ "prediction": "Stylish design and lightweight. Available in multiple colors.",
+ "label": false
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Worcester forum.",
+ "reference": "0",
+ "prediction": "There are no comments that have received more downvotes than upvotes for the user who made the latest post on the Worcester forum.",
+ "label": true
+ },
+ {
+ "task": "Show me the command to clone ChatGPT with SSH.",
+ "reference": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/chatgpt.git",
+ "prediction": "git clone ssh://git@github.com:convexegg/chatgpt.git",
+ "label": false
+ },
+ {
+ "task": "What is the top-1 best-selling product in 2022",
+ "reference": "Quest Lumaflex\u2122 Band",
+ "prediction": "The top-1 best-selling product in 2022 is the Quest Lumaflex\u2122 Band.",
+ "label": true
+ },
+ {
+ "task": "Get the total payment amount of the last 2 completed orders",
+ "reference": "$182.4",
+ "prediction": "The total payment amount of the last 2 completed orders is $182.4.",
+ "label": true
+ },
+ {
+ "task": "What is the phone number of Western Pennsylvania Hospital",
+ "reference": "4125785000",
+ "prediction": "The phone number of Western Pennsylvania Hospital is 4125785000.",
+ "label": true
+ },
+ {
+ "task": "Show me the command to clone the most stared Covid location tracker with SSH.",
+ "reference": "ssh://git@metis.lti.cs.cmu.edu:2222/yjlou/2019-nCov.git",
+ "prediction": "To clone the most starred Covid location tracker with SSH, use the command: ssh://git@metis.lti.cs.cmu.edu:2222/yjlou/2019-nCov.git",
+ "label": true
+ },
+ {
+ "task": "Get the customer name of the most recent cancelled order",
+ "reference": "Lily Potter",
+ "prediction": "The customer name of the most recent cancelled order is Lily Potter.",
+ "label": true
+ },
+ {
+ "task": "Show me the name of the customer who is the most unhappy with Antonia racer tank",
+ "reference": "Shaunte",
+ "prediction": "The customer who is the most unhappy with the Antonia racer tank is Shaunte.",
+ "label": true
+ },
+ {
+ "task": "Get the order number of my most recent pending order ",
+ "reference": "189",
+ "prediction": "Your most recent pending order number is 256.",
+ "label": false
+ },
+ {
+ "task": "How much I spent on food shopping during from mid Jan to the end Jan 2023",
+ "reference": "0",
+ "prediction": "You spent a total of $0 on food shopping from mid Jan to the end of Jan 2023.",
+ "label": true
+ },
+ {
+ "task": "Get the order number of my most recent cancelled order ",
+ "reference": "170",
+ "prediction": "The order number for your most recent cancelled order is 145.",
+ "label": false
+ },
+ {
+ "task": "Open my latest created issue that has homepage content in its title to check if it is closed",
+ "reference": "Yes",
+ "prediction": "No, the issue is still open and pending review.",
+ "label": false
+ },
+ {
+ "task": "Show me the customers who have expressed dissatisfaction with Circe fleece?",
+ "reference": "Hannah Lim",
+ "prediction": "Michael Scott",
+ "label": false
+ },
+ {
+ "task": "How many commits did Eric and Kilian make to a11yproject on 1/3/2023?",
+ "reference": "1",
+ "prediction": "Eric and Kilian made a total of 1 commit to a11yproject on 1/3/2023.",
+ "label": true
+ },
+ {
+ "task": "Who is the operator of PIT airport",
+ "reference": "Allegheny County Airport Authority",
+ "prediction": "The operator of PIT airport is the Allegheny County Airport Authority.",
+ "label": true
+ },
+ {
+ "task": "What's the closest national park to the largest city in Maine?",
+ "reference": "Acadia National Park",
+ "prediction": "The closest national park to the largest city in Maine is Acadia National Park.",
+ "label": true
+ },
+ {
+ "task": "Given the following locations, ['Princeton University', 'Yale University', 'Harvard University'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "reference": "The order is Princeton University, Yale University, Harvard University",
+ "prediction": "The optimal route to minimize total travel time is: Princeton University, Yale University, Harvard University.",
+ "label": true
+ },
+ {
+ "task": "What's the total number of items sold in the most recent 5 orders?",
+ "reference": "18",
+ "prediction": "The total number of items sold in the most recent 5 orders is 18.",
+ "label": true
+ },
+ {
+ "task": "Get the total payment amount of the last 5 non-cancelled orders",
+ "reference": "$778.2",
+ "prediction": "The total payment amount of the last 5 non-cancelled orders is $778.2.",
+ "label": true
+ },
+ {
+ "task": "How much I spent on hair care and hair style shopping during Jan 2023",
+ "reference": "95.23",
+ "prediction": "You spent $150.75 on hair care and hair style shopping during Jan 2023.",
+ "label": false
+ },
+ {
+ "task": "How many commits did Eric and Kilian make on 1/3/2023 in total?",
+ "reference": "1",
+ "prediction": "Eric and Kilian made a total of 1 commit on 1/3/2023.",
+ "label": true
+ },
+ {
+ "task": "What is the total count of Pending reviews amongst all the reviews?",
+ "reference": "5",
+ "prediction": "The total count of Pending reviews amongst all the reviews is 12.",
+ "label": false
+ },
+ {
+ "task": "Given the following locations, ['Massachusetts Institute of Technology', 'Harvard University', 'Boston Logan International Airport'], what would be the optimal route to travel through them all in order to minimize total travel time? Please note the journey begins at the first place listed.",
+ "reference": "The order is Massachusetts Institute of Technology, Harvard University, Boston Logan International Airport",
+ "prediction": "The optimal route is: Massachusetts Institute of Technology, Boston Logan International Airport, Harvard University.",
+ "label": false
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to CMU Posner Hall",
+ "reference": "The exchange",
+ "prediction": "The closest restaurant to CMU Posner Hall is The Exchange.",
+ "label": true
+ },
+ {
+ "task": "Tell me the closest restaurant(s) to CMU Posner Hall",
+ "reference": "The exchange",
+ "prediction": "The closest restaurant to CMU Posner Hall is The Golden Spoon.",
+ "label": false
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.",
+ "reference": "0",
+ "prediction": "There are 5 comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.",
+ "label": false
+ },
+ {
+ "task": "Tell me the reasons why customers like Antonia Racer Tank",
+ "reference": "Its color and style is good",
+ "prediction": "Customers like the Antonia Racer Tank because its color and style are good.",
+ "label": true
+ },
+ {
+ "task": "Show me the command to clone ChatGPT with SSH.",
+ "reference": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/chatgpt.git",
+ "prediction": "The command to clone ChatGPT with SSH is: git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/chatgpt.git",
+ "label": true
+ },
+ {
+ "task": "What is the rating of Ugreen lightning to 3.5mm cable",
+ "reference": "On a scale of 100, it's around 65%. On a 5-star scale, it's about 3.1 stars.",
+ "prediction": "The Ugreen lightning to 3.5mm cable is rated 4.8 stars out of 5.",
+ "label": false
+ },
+ {
+ "task": "What is the duration required to first walk from Univ of Pittsburgh to starbucks on Craig Street, and then drive to Pittsburgh International Airport?",
+ "reference": "49 min",
+ "prediction": "The total duration required is approximately 49 minutes.",
+ "label": true
+ },
+ {
+ "task": "Check if the social security administration in pittsburgh can be reached in one hour by car from Carnegie Mellon University",
+ "reference": "Yes",
+ "prediction": "Yes, the social security administration in Pittsburgh can be reached in one hour by car from Carnegie Mellon University.",
+ "label": true
+ },
+ {
+ "task": "What is the price configuration of the fake tree I bought Jan 2023",
+ "reference": "$260.69",
+ "prediction": "The price configuration of the fake tree you bought in January 2023 is $149.99.",
+ "label": false
+ },
+ {
+ "task": "What is the total count of Pending reviews amongst all the reviews?",
+ "reference": "5",
+ "prediction": "The total count of Pending reviews amongst all the reviews is 12.",
+ "label": false
+ },
+ {
+ "task": "Show me the command to clone Super_Awesome_Robot with SSH.",
+ "reference": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/super_awesome_robot.git",
+ "prediction": "To clone Super_Awesome_Robot with SSH, use the command: git clone https://github.com/convexegg/super_awesome_robot.git",
+ "label": false
+ },
+ {
+ "task": "Show me the name of the customer who is the most unhappy with Chloe tank",
+ "reference": "Teofila",
+ "prediction": "The customer who is the most unhappy with Chloe tank is Teofila.",
+ "label": true
+ },
+ {
+ "task": "Tell me the distance to drive from Carnegie Mellon University to the top computer science school in massachusetts",
+ "reference": "914km",
+ "prediction": "The distance to drive from Carnegie Mellon University to the top computer science school in Massachusetts is approximately 1,200 miles.",
+ "label": false
+ },
+ {
+ "task": "What are the key aspects that the customers don't like about Antonia Racer Tank",
+ "reference": "Not suitable for high-impact workouts",
+ "prediction": "Customers have noted that the Antonia Racer Tank is not suitable for high-impact workouts.",
+ "label": true
+ },
+ {
+ "task": "What is the zip code of Columbia University?",
+ "reference": "10027",
+ "prediction": "The zip code of Columbia University is 10118.",
+ "label": false
+ },
+ {
+ "task": "How much I spent on food-related shopping during March 2023",
+ "reference": "47.41",
+ "prediction": "You spent a total of $47.41 on food-related shopping during March 2023.",
+ "label": true
+ },
+ {
+ "task": "Show me the command to clone Super_Awesome_Robot with SSH.",
+ "reference": "git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/super_awesome_robot.git",
+ "prediction": "To clone Super_Awesome_Robot with SSH, use the following command: git clone ssh://git@metis.lti.cs.cmu.edu:2222/convexegg/super_awesome_robot.git",
+ "label": true
+ },
+ {
+ "task": "Tell me the the number of reviews that our store received so far that mention term \"not useful\"",
+ "reference": "0",
+ "prediction": "Our store has received a total of 5 reviews that mention the term 'not useful'.",
+ "label": false
+ },
+ {
+ "task": "Tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the photoshopbattles forum.",
+ "reference": "0",
+ "prediction": "There are 3 comments that have received more downvotes than upvotes for the user who made the latest post on the photoshopbattles forum.",
+ "label": false
+ },
+ {
+ "task": "What is the rating of Ugreen lightning to 3.5mm cable",
+ "reference": "On a scale of 100, it's around 65%. On a 5-star scale, it's about 3.1 stars.",
+ "prediction": "The Ugreen lightning to 3.5mm cable has an excellent rating of 4.8 stars out of 5.",
+ "label": false
+ }
+ ],
+ "context_qa": [
+ {
+ "task": "Which customer likes Nike Airmax the most? Show me their name, phone number and address",
+ "prediction": "Here is the customer I found who likes Nike Airmax the most: Name: Michael Chang, Phone Number: 412-789-0034, Address: 998 Bakery Street, Pittsburgh, PA 15222",
+ "question": "Does this passage clearly state the customer's name is Michael Chang?",
+ "label": "yes"
+ },
+ {
+ "task": "Which restaurant has the best pizza in Chicago? Provide the restaurant's name, address, and contact number.",
+ "question": "Does this passage clearly state the restaurant's name is 'Lou Malnati's Pizzeria'?",
+ "label": "no",
+ "plan": "I will produce an answer which finds a different restaurant as the best pizza place in Chicago.",
+ "prediction": "According to reviews, the restaurant with the best pizza in Chicago is 'Gino's East'. The address is 1120 N State St, Chicago, IL 60610, and the contact number is 312-725-7777."
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the shortest walking time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the walking time is 3h 30min?",
+ "label": "no",
+ "plan": "I will generate an answer which contains a different walking time to the supermarket.",
+ "prediction": "- **Hyatt Hotel**: Hyatt Regency Pittsburgh International Airport\n- **Walking Time to Supermarket**: 30-35 minutes to the nearest Giant Eagle Supermarket located at 100 Settlers Ridge Center Dr, Pittsburgh, PA 15205."
+ },
+ {
+ "task": "Which country has the largest population in Africa? Provide the country's name and population count.",
+ "prediction": "The country with the largest population in Africa is Nigeria",
+ "question": "Does this passage clearly state the country's name is Nigeria?",
+ "label": "yes"
+ },
+ {
+ "task": "I am planning a trip to Japan. Which city is famous for its cherry blossoms? Provide the city's name and the best time to visit.",
+ "question": "Does this passage clearly state the city's name is 'Osaka'?",
+ "label": "no",
+ "plan": "I will provide an answer which contains the correct information about the famous city for cherry blossoms in Japan.",
+ "prediction": "The city famous for its cherry blossoms in Japan is Kyoto. The best time to visit is usually between late March and early April."
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the walking distance to the nearest supermarket own by a local company from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is DoubleTree by Hilton Hotel Pittsburgh Airport?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the walking distance to the nearest supermarket own by a local company from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is DoubleTree by Hilton Hotel Pittsburgh Airport?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the walking distance to the nearest supermarket own by a local company from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is DoubleTree by Hilton Hotel Pittsburgh Airport?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the walking distance to the nearest supermarket own by a local company from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is DoubleTree by Hilton Hotel Pittsburgh Airport?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the walking distance to the nearest supermarket own by a local company from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is DoubleTree by Hilton Hotel Pittsburgh Airport?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the walking distance to the nearest supermarket own by a local company from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is DoubleTree by Hilton Hotel Pittsburgh Airport?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the walking distance to the nearest supermarket own by a local company from the hotel.",
+ "question": "Does the passage clearly state the distance is 2.0km?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the walking distance to the nearest supermarket own by a local company from the hotel.",
+ "question": "Does the passage clearly state the distance is 2.0km?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the walking distance to the nearest supermarket own by a local company from the hotel.",
+ "question": "Does the passage clearly state the distance is 2.0km?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the walking distance to the nearest supermarket own by a local company from the hotel.",
+ "question": "Does the passage clearly state the distance is 2.0km?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the walking distance to the nearest supermarket own by a local company from the hotel.",
+ "question": "Does the passage clearly state the distance is 2.0km?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the walking distance to the nearest supermarket own by a local company from the hotel.",
+ "question": "Does the passage clearly state the distance is 2.0km?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the shortest walking distance to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is DoubleTree by Hilton Hotel Pittsburgh Airport?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the shortest walking distance to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is DoubleTree by Hilton Hotel Pittsburgh Airport?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the shortest walking distance to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is DoubleTree by Hilton Hotel Pittsburgh Airport?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the shortest walking distance to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is DoubleTree by Hilton Hotel Pittsburgh Airport?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the shortest walking distance to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is DoubleTree by Hilton Hotel Pittsburgh Airport?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the shortest walking distance to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is DoubleTree by Hilton Hotel Pittsburgh Airport?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the shortest walking distance to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the distance is 1.4km?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the shortest walking distance to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the distance is 1.4km?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the shortest walking distance to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the distance is 1.4km?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the shortest walking distance to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the distance is 1.4km?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the shortest walking distance to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the distance is 1.4km?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hilton hotel in the vicinity, if available. Then, tell me the the shortest walking distance to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the distance is 1.4km?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the shortest walking time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is Hyatt Regency Pittsburgh International Airport?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the shortest walking time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is Hyatt Regency Pittsburgh International Airport?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the shortest walking time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is Hyatt Regency Pittsburgh International Airport?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the shortest walking time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is Hyatt Regency Pittsburgh International Airport?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the shortest walking time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is Hyatt Regency Pittsburgh International Airport?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the shortest walking time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is Hyatt Regency Pittsburgh International Airport?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the shortest walking time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the walking time is 3h 30min?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the shortest walking time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the walking time is 3h 30min?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the shortest walking time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the walking time is 3h 30min?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the shortest walking time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the walking time is 3h 30min?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the shortest walking time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the walking time is 3h 30min?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the shortest walking time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the walking time is 3h 30min?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the minimal driving time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is Hyatt Regency Pittsburgh International Airport?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the minimal driving time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is Hyatt Regency Pittsburgh International Airport?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the minimal driving time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is Hyatt Regency Pittsburgh International Airport?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the minimal driving time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is Hyatt Regency Pittsburgh International Airport?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the minimal driving time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is Hyatt Regency Pittsburgh International Airport?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the minimal driving time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the name of the hotel is Hyatt Regency Pittsburgh International Airport?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the minimal driving time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the driving time is 15min?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the minimal driving time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the driving time is 15min?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the minimal driving time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the driving time is 15min?",
+ "label": "yes"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the minimal driving time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the driving time is 15min?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the minimal driving time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the driving time is 15min?",
+ "label": "no"
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the minimal driving time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the driving time is 15min?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 0?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 0?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 0?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 0?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 0?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 0?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $0?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $0?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $0?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $0?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $0?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $0?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past three days, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 0?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past three days, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 0?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past three days, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 0?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past three days, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 0?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past three days, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 0?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past three days, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 0?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past three days, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $0?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past three days, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $0?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past three days, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $0?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past three days, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $0?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past three days, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $0?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past three days, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $0?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past four month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 3?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past four month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 3?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past four month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 3?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past four month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 3?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past four month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 3?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past four month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 3?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past four month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $845.49?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past four month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $845.49?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past four month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $845.49?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past four month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $845.49?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past four month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $845.49?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past four month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $845.49?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past year, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 24?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past year, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 24?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past year, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 24?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past year, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 24?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past year, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 24?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past year, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 24?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past year, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $6560.69?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past year, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $6560.69?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past year, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $6560.69?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past year, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $6560.69?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past year, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $6560.69?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past year, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $6560.69?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past six month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 12?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past six month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 12?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past six month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 12?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past six month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 12?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past six month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 12?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past six month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the number of orders is 12?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past six month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $1603.69?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past six month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $1603.69?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past six month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $1603.69?",
+ "label": "yes"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past six month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $1603.69?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past six month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $1603.69?",
+ "label": "no"
+ },
+ {
+ "task": "Today is 6/12/2023. Tell me how many fulfilled orders I have over the past six month, and the total amount of money I spent.",
+ "question": "Does the passage clearly state the total spend is $1603.69?",
+ "label": "no"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention one of the books is \"A Christmas Carol\"?",
+ "label": "yes"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention one of the books is \"A Christmas Carol\"?",
+ "label": "yes"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention one of the books is \"A Christmas Carol\"?",
+ "label": "yes"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention one of the books is \"A Christmas Carol\"?",
+ "label": "no"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention one of the books is \"A Christmas Carol\"?",
+ "label": "no"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention one of the books is \"A Christmas Carol\"?",
+ "label": "no"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention one of the books is \"The Hobbit\"?",
+ "label": "yes"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention one of the books is \"The Hobbit\"?",
+ "label": "yes"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention one of the books is \"The Hobbit\"?",
+ "label": "yes"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention one of the books is \"The Hobbit\"?",
+ "label": "no"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention one of the books is \"The Hobbit\"?",
+ "label": "no"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention one of the books is \"The Hobbit\"?",
+ "label": "no"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention the author of \"A Christmas Carol\" is \"Levar Burton\"?",
+ "label": "yes"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention the author of \"A Christmas Carol\" is \"Levar Burton\"?",
+ "label": "yes"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention the author of \"A Christmas Carol\" is \"Levar Burton\"?",
+ "label": "yes"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention the author of \"A Christmas Carol\" is \"Levar Burton\"?",
+ "label": "no"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention the author of \"A Christmas Carol\" is \"Levar Burton\"?",
+ "label": "no"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention the author of \"A Christmas Carol\" is \"Levar Burton\"?",
+ "label": "no"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention the author of \"The Hobbit\" is \"J. R. R. Tolkien\"?",
+ "label": "yes"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention the author of \"The Hobbit\" is \"J. R. R. Tolkien\"?",
+ "label": "yes"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention the author of \"The Hobbit\" is \"J. R. R. Tolkien\"?",
+ "label": "yes"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention the author of \"The Hobbit\" is \"J. R. R. Tolkien\"?",
+ "label": "no"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention the author of \"The Hobbit\" is \"J. R. R. Tolkien\"?",
+ "label": "no"
+ },
+ {
+ "task": "Among the top 10 post in \"books\" forum, show me the author name and the book name from posts that recommand a single book",
+ "question": "Does this passage mention the author of \"The Hobbit\" is \"J. R. R. Tolkien\"?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the nearest tea cafe is Fuku Tea?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the nearest tea cafe is Fuku Tea?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the nearest tea cafe is Fuku Tea?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the nearest tea cafe is Fuku Tea?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the nearest tea cafe is Fuku Tea?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the nearest tea cafe is Fuku Tea?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the address of Fuku Tea is 3716 Forbes Avenue?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the address of Fuku Tea is 3716 Forbes Avenue?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the address of Fuku Tea is 3716 Forbes Avenue?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the address of Fuku Tea is 3716 Forbes Avenue?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the address of Fuku Tea is 3716 Forbes Avenue?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the address of Fuku Tea is 3716 Forbes Avenue?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the walking distance to Fuku Tea is 653m?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the walking distance to Fuku Tea is 653m?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the walking distance to Fuku Tea is 653m?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the walking distance to Fuku Tea is 653m?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the walking distance to Fuku Tea is 653m?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest tea cafe to University of Pittsburgh, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the walking distance to Fuku Tea is 653m?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest Five Guys to 5700 Penn Ave, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the nearest Five Guys is at 117 South Bouquet Street?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest Five Guys to 5700 Penn Ave, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the nearest Five Guys is at 117 South Bouquet Street?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest Five Guys to 5700 Penn Ave, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the nearest Five Guys is at 117 South Bouquet Street?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest Five Guys to 5700 Penn Ave, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the nearest Five Guys is at 117 South Bouquet Street?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest Five Guys to 5700 Penn Ave, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the nearest Five Guys is at 117 South Bouquet Street?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest Five Guys to 5700 Penn Ave, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the nearest Five Guys is at 117 South Bouquet Street?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest Five Guys to 5700 Penn Ave, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the walking distance to Five Guys is 4.0km?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest Five Guys to 5700 Penn Ave, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the walking distance to Five Guys is 4.0km?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest Five Guys to 5700 Penn Ave, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the walking distance to Five Guys is 4.0km?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest Five Guys to 5700 Penn Ave, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the walking distance to Five Guys is 4.0km?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest Five Guys to 5700 Penn Ave, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the walking distance to Five Guys is 4.0km?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest Five Guys to 5700 Penn Ave, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the walking distance to Five Guys is 4.0km?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest Starbucks to Carnegie Mellon, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the nearest Starbucks is at 417 South Craig Street?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest Starbucks to Carnegie Mellon, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the nearest Starbucks is at 417 South Craig Street?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest Starbucks to Carnegie Mellon, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the nearest Starbucks is at 417 South Craig Street?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest Starbucks to Carnegie Mellon, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the nearest Starbucks is at 417 South Craig Street?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest Starbucks to Carnegie Mellon, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the nearest Starbucks is at 417 South Craig Street?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest Starbucks to Carnegie Mellon, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the nearest Starbucks is at 417 South Craig Street?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest Starbucks to Carnegie Mellon, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the walking distance to Starbucks is 557m?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest Starbucks to Carnegie Mellon, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the walking distance to Starbucks is 557m?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest Starbucks to Carnegie Mellon, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the walking distance to Starbucks is 557m?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest Starbucks to Carnegie Mellon, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the walking distance to Starbucks is 557m?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest Starbucks to Carnegie Mellon, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the walking distance to Starbucks is 557m?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest Starbucks to Carnegie Mellon, and what is the walking distance to it?",
+ "question": "Does this passage clearly state the walking distance to Starbucks is 557m?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range of wireless earphone in the One Stop Market?",
+ "question": "Does this passage clearly state the lowest price of wireless earphone is $0.14?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range of wireless earphone in the One Stop Market?",
+ "question": "Does this passage clearly state the lowest price of wireless earphone is $0.14?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range of wireless earphone in the One Stop Market?",
+ "question": "Does this passage clearly state the lowest price of wireless earphone is $0.14?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range of wireless earphone in the One Stop Market?",
+ "question": "Does this passage clearly state the lowest price of wireless earphone is $0.14?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range of wireless earphone in the One Stop Market?",
+ "question": "Does this passage clearly state the lowest price of wireless earphone is $0.14?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range of wireless earphone in the One Stop Market?",
+ "question": "Does this passage clearly state the lowest price of wireless earphone is $0.14?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range of wireless earphone in the One Stop Market?",
+ "question": "Does this passage clearly state the highest price of wireless earphone is $745.00?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range of wireless earphone in the One Stop Market?",
+ "question": "Does this passage clearly state the highest price of wireless earphone is $745.00?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range of wireless earphone in the One Stop Market?",
+ "question": "Does this passage clearly state the highest price of wireless earphone is $745.00?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range of wireless earphone in the One Stop Market?",
+ "question": "Does this passage clearly state the highest price of wireless earphone is $745.00?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range of wireless earphone in the One Stop Market?",
+ "question": "Does this passage clearly state the highest price of wireless earphone is $745.00?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range of wireless earphone in the One Stop Market?",
+ "question": "Does this passage clearly state the highest price of wireless earphone is $745.00?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range of teeth grinding mouth guard in the One Stop Market?",
+ "question": "Does this passage clearly state the lowest price of teeth grinding mouth guard is $1.46?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range of teeth grinding mouth guard in the One Stop Market?",
+ "question": "Does this passage clearly state the lowest price of teeth grinding mouth guard is $1.46?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range of teeth grinding mouth guard in the One Stop Market?",
+ "question": "Does this passage clearly state the lowest price of teeth grinding mouth guard is $1.46?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range of teeth grinding mouth guard in the One Stop Market?",
+ "question": "Does this passage clearly state the lowest price of teeth grinding mouth guard is $1.46?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range of teeth grinding mouth guard in the One Stop Market?",
+ "question": "Does this passage clearly state the lowest price of teeth grinding mouth guard is $1.46?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range of teeth grinding mouth guard in the One Stop Market?",
+ "question": "Does this passage clearly state the lowest price of teeth grinding mouth guard is $1.46?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range of teeth grinding mouth guard in the One Stop Market?",
+ "question": "Does this passage clearly state the highest price of teeth grinding mouth guard is $85.00?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range of teeth grinding mouth guard in the One Stop Market?",
+ "question": "Does this passage clearly state the highest price of teeth grinding mouth guard is $85.00?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range of teeth grinding mouth guard in the One Stop Market?",
+ "question": "Does this passage clearly state the highest price of teeth grinding mouth guard is $85.00?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range of teeth grinding mouth guard in the One Stop Market?",
+ "question": "Does this passage clearly state the highest price of teeth grinding mouth guard is $85.00?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range of teeth grinding mouth guard in the One Stop Market?",
+ "question": "Does this passage clearly state the highest price of teeth grinding mouth guard is $85.00?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range of teeth grinding mouth guard in the One Stop Market?",
+ "question": "Does this passage clearly state the highest price of teeth grinding mouth guard is $85.00?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range of Canon photo printer in the One Stop Market?",
+ "question": "Does this passage clearly state the lowest price of Canon photo printer is $2.56?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range of Canon photo printer in the One Stop Market?",
+ "question": "Does this passage clearly state the lowest price of Canon photo printer is $2.56?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range of Canon photo printer in the One Stop Market?",
+ "question": "Does this passage clearly state the lowest price of Canon photo printer is $2.56?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range of Canon photo printer in the One Stop Market?",
+ "question": "Does this passage clearly state the lowest price of Canon photo printer is $2.56?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range of Canon photo printer in the One Stop Market?",
+ "question": "Does this passage clearly state the lowest price of Canon photo printer is $2.56?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range of Canon photo printer in the One Stop Market?",
+ "question": "Does this passage clearly state the lowest price of Canon photo printer is $2.56?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range of Canon photo printer in the One Stop Market?",
+ "question": "Does this passage clearly state the highest price of Canon photo printer is $649.99?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range of Canon photo printer in the One Stop Market?",
+ "question": "Does this passage clearly state the highest price of Canon photo printer is $649.99?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range of Canon photo printer in the One Stop Market?",
+ "question": "Does this passage clearly state the highest price of Canon photo printer is $649.99?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range of Canon photo printer in the One Stop Market?",
+ "question": "Does this passage clearly state the highest price of Canon photo printer is $649.99?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range of Canon photo printer in the One Stop Market?",
+ "question": "Does this passage clearly state the highest price of Canon photo printer is $649.99?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range of Canon photo printer in the One Stop Market?",
+ "question": "Does this passage clearly state the highest price of Canon photo printer is $649.99?",
+ "label": "no"
+ },
+ {
+ "task": "Get the purchase date and order id of the most recent pending order",
+ "question": "Does this passage clearly state the order id is 000000299?",
+ "label": "yes"
+ },
+ {
+ "task": "Get the purchase date and order id of the most recent pending order",
+ "question": "Does this passage clearly state the order id is 000000299?",
+ "label": "yes"
+ },
+ {
+ "task": "Get the purchase date and order id of the most recent pending order",
+ "question": "Does this passage clearly state the order id is 000000299?",
+ "label": "yes"
+ },
+ {
+ "task": "Get the purchase date and order id of the most recent pending order",
+ "question": "Does this passage clearly state the order id is 000000299?",
+ "label": "no"
+ },
+ {
+ "task": "Get the purchase date and order id of the most recent pending order",
+ "question": "Does this passage clearly state the order id is 000000299?",
+ "label": "no"
+ },
+ {
+ "task": "Get the purchase date and order id of the most recent pending order",
+ "question": "Does this passage clearly state the order id is 000000299?",
+ "label": "no"
+ },
+ {
+ "task": "Get the purchase date and order id of the most recent pending order",
+ "question": "Does this passage clearly state the purchase date is May 31, 2023?",
+ "label": "yes"
+ },
+ {
+ "task": "Get the purchase date and order id of the most recent pending order",
+ "question": "Does this passage clearly state the purchase date is May 31, 2023?",
+ "label": "yes"
+ },
+ {
+ "task": "Get the purchase date and order id of the most recent pending order",
+ "question": "Does this passage clearly state the purchase date is May 31, 2023?",
+ "label": "yes"
+ },
+ {
+ "task": "Get the purchase date and order id of the most recent pending order",
+ "question": "Does this passage clearly state the purchase date is May 31, 2023?",
+ "label": "no"
+ },
+ {
+ "task": "Get the purchase date and order id of the most recent pending order",
+ "question": "Does this passage clearly state the purchase date is May 31, 2023?",
+ "label": "no"
+ },
+ {
+ "task": "Get the purchase date and order id of the most recent pending order",
+ "question": "Does this passage clearly state the purchase date is May 31, 2023?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number +1 2058812302",
+ "question": "Does this passage clearly state the customer's name is John Smith?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number +1 2058812302",
+ "question": "Does this passage clearly state the customer's name is John Smith?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number +1 2058812302",
+ "question": "Does this passage clearly state the customer's name is John Smith?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number +1 2058812302",
+ "question": "Does this passage clearly state the customer's name is John Smith?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number +1 2058812302",
+ "question": "Does this passage clearly state the customer's name is John Smith?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number +1 2058812302",
+ "question": "Does this passage clearly state the customer's name is John Smith?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number +1 2058812302",
+ "question": "Does this passage clearly state the customer's email is john.smith.xyz@gmail.com.",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number +1 2058812302",
+ "question": "Does this passage clearly state the customer's email is john.smith.xyz@gmail.com.",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number +1 2058812302",
+ "question": "Does this passage clearly state the customer's email is john.smith.xyz@gmail.com.",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number +1 2058812302",
+ "question": "Does this passage clearly state the customer's email is john.smith.xyz@gmail.com.",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number +1 2058812302",
+ "question": "Does this passage clearly state the customer's email is john.smith.xyz@gmail.com.",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number +1 2058812302",
+ "question": "Does this passage clearly state the customer's email is john.smith.xyz@gmail.com.",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2137418080",
+ "question": "Does this passage clearly state the customer's name is Jennifer White?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2137418080",
+ "question": "Does this passage clearly state the customer's name is Jennifer White?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2137418080",
+ "question": "Does this passage clearly state the customer's name is Jennifer White?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2137418080",
+ "question": "Does this passage clearly state the customer's name is Jennifer White?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2137418080",
+ "question": "Does this passage clearly state the customer's name is Jennifer White?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2137418080",
+ "question": "Does this passage clearly state the customer's name is Jennifer White?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2137418080",
+ "question": "Does this passage clearly state the customer's email is jennifer.white@yahoo.com.",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2137418080",
+ "question": "Does this passage clearly state the customer's email is jennifer.white@yahoo.com.",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2137418080",
+ "question": "Does this passage clearly state the customer's email is jennifer.white@yahoo.com.",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2137418080",
+ "question": "Does this passage clearly state the customer's email is jennifer.white@yahoo.com.",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2137418080",
+ "question": "Does this passage clearly state the customer's email is jennifer.white@yahoo.com.",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2137418080",
+ "question": "Does this passage clearly state the customer's email is jennifer.white@yahoo.com.",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2065555555",
+ "question": "Does this passage clearly state the customer's name is Adam Garcia?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2065555555",
+ "question": "Does this passage clearly state the customer's name is Adam Garcia?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2065555555",
+ "question": "Does this passage clearly state the customer's name is Adam Garcia?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2065555555",
+ "question": "Does this passage clearly state the customer's name is Adam Garcia?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2065555555",
+ "question": "Does this passage clearly state the customer's name is Adam Garcia?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2065555555",
+ "question": "Does this passage clearly state the customer's name is Adam Garcia?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2065555555",
+ "question": "Does this passage clearly state the customer's email is gamingpro456@gmail.com.",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2065555555",
+ "question": "Does this passage clearly state the customer's email is gamingpro456@gmail.com.",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2065555555",
+ "question": "Does this passage clearly state the customer's email is gamingpro456@gmail.com.",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2065555555",
+ "question": "Does this passage clearly state the customer's email is gamingpro456@gmail.com.",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2065555555",
+ "question": "Does this passage clearly state the customer's email is gamingpro456@gmail.com.",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 2065555555",
+ "question": "Does this passage clearly state the customer's email is gamingpro456@gmail.com.",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 8015551212",
+ "question": "Does this passage clearly state the customer's name is Sean Miller?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 8015551212",
+ "question": "Does this passage clearly state the customer's name is Sean Miller?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 8015551212",
+ "question": "Does this passage clearly state the customer's name is Sean Miller?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 8015551212",
+ "question": "Does this passage clearly state the customer's name is Sean Miller?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 8015551212",
+ "question": "Does this passage clearly state the customer's name is Sean Miller?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 8015551212",
+ "question": "Does this passage clearly state the customer's name is Sean Miller?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 8015551212",
+ "question": "Does this passage clearly state the customer's email is sean.miller@gmail.com?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 8015551212",
+ "question": "Does this passage clearly state the customer's email is sean.miller@gmail.com?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 8015551212",
+ "question": "Does this passage clearly state the customer's email is sean.miller@gmail.com?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 8015551212",
+ "question": "Does this passage clearly state the customer's email is sean.miller@gmail.com?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 8015551212",
+ "question": "Does this passage clearly state the customer's email is sean.miller@gmail.com?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 8015551212",
+ "question": "Does this passage clearly state the customer's email is sean.miller@gmail.com?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 555-229-3326",
+ "question": "Does this passage clearly state the customer's name is Veronica Costello?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 555-229-3326",
+ "question": "Does this passage clearly state the customer's name is Veronica Costello?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 555-229-3326",
+ "question": "Does this passage clearly state the customer's name is Veronica Costello?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 555-229-3326",
+ "question": "Does this passage clearly state the customer's name is Veronica Costello?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 555-229-3326",
+ "question": "Does this passage clearly state the customer's name is Veronica Costello?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 555-229-3326",
+ "question": "Does this passage clearly state the customer's name is Veronica Costello?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 555-229-3326",
+ "question": "Does this passage clearly state the customer's email is roni_cost@example.com?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 555-229-3326",
+ "question": "Does this passage clearly state the customer's email is roni_cost@example.com?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 555-229-3326",
+ "question": "Does this passage clearly state the customer's email is roni_cost@example.com?",
+ "label": "yes"
+ },
+ {
+ "task": "Find the customer name and email with phone number 555-229-3326",
+ "question": "Does this passage clearly state the customer's email is roni_cost@example.com?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 555-229-3326",
+ "question": "Does this passage clearly state the customer's email is roni_cost@example.com?",
+ "label": "no"
+ },
+ {
+ "task": "Find the customer name and email with phone number 555-229-3326",
+ "question": "Does this passage clearly state the customer's email is roni_cost@example.com?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from Amazon basic?",
+ "question": "Does this passage clearly state the lowest price is $5.49?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from Amazon basic?",
+ "question": "Does this passage clearly state the lowest price is $5.49?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from Amazon basic?",
+ "question": "Does this passage clearly state the lowest price is $5.49?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from Amazon basic?",
+ "question": "Does this passage clearly state the lowest price is $5.49?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from Amazon basic?",
+ "question": "Does this passage clearly state the lowest price is $5.49?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from Amazon basic?",
+ "question": "Does this passage clearly state the lowest price is $5.49?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from Amazon basic?",
+ "question": "Does this passage clearly state the highest price is $375.19?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from Amazon basic?",
+ "question": "Does this passage clearly state the highest price is $375.19?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from Amazon basic?",
+ "question": "Does this passage clearly state the highest price is $375.19?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from Amazon basic?",
+ "question": "Does this passage clearly state the highest price is $375.19?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from Amazon basic?",
+ "question": "Does this passage clearly state the highest price is $375.19?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from Amazon basic?",
+ "question": "Does this passage clearly state the highest price is $375.19?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from EYZUTAK?",
+ "question": "Does this passage clearly state there is only one product from EYZUTAK, and the price is $9.99?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from EYZUTAK?",
+ "question": "Does this passage clearly state there is only one product from EYZUTAK, and the price is $9.99?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from EYZUTAK?",
+ "question": "Does this passage clearly state there is only one product from EYZUTAK, and the price is $9.99?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from EYZUTAK?",
+ "question": "Does this passage clearly state there is only one product from EYZUTAK, and the price is $9.99?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from EYZUTAK?",
+ "question": "Does this passage clearly state there is only one product from EYZUTAK, and the price is $9.99?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from EYZUTAK?",
+ "question": "Does this passage clearly state there is only one product from EYZUTAK, and the price is $9.99?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from sephora?",
+ "question": "Does this passage clearly state the lowest price is $18.18?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from sephora?",
+ "question": "Does this passage clearly state the lowest price is $18.18?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from sephora?",
+ "question": "Does this passage clearly state the lowest price is $18.18?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from sephora?",
+ "question": "Does this passage clearly state the lowest price is $18.18?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from sephora?",
+ "question": "Does this passage clearly state the lowest price is $18.18?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from sephora?",
+ "question": "Does this passage clearly state the lowest price is $18.18?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from sephora?",
+ "question": "Does this passage clearly state the highest price is $94.99?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from sephora?",
+ "question": "Does this passage clearly state the highest price is $94.99?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from sephora?",
+ "question": "Does this passage clearly state the highest price is $94.99?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from sephora?",
+ "question": "Does this passage clearly state the highest price is $94.99?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from sephora?",
+ "question": "Does this passage clearly state the highest price is $94.99?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from sephora?",
+ "question": "Does this passage clearly state the highest price is $94.99?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from ugreen?",
+ "question": "Does this passage clearly state the lowest price is $6.99?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from ugreen?",
+ "question": "Does this passage clearly state the lowest price is $6.99?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from ugreen?",
+ "question": "Does this passage clearly state the lowest price is $6.99?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from ugreen?",
+ "question": "Does this passage clearly state the lowest price is $6.99?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from ugreen?",
+ "question": "Does this passage clearly state the lowest price is $6.99?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from ugreen?",
+ "question": "Does this passage clearly state the lowest price is $6.99?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from ugreen?",
+ "question": "Does this passage clearly state the highest price is $38.99?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from ugreen?",
+ "question": "Does this passage clearly state the highest price is $38.99?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from ugreen?",
+ "question": "Does this passage clearly state the highest price is $38.99?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from ugreen?",
+ "question": "Does this passage clearly state the highest price is $38.99?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from ugreen?",
+ "question": "Does this passage clearly state the highest price is $38.99?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from ugreen?",
+ "question": "Does this passage clearly state the highest price is $38.99?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from Perricone MD?",
+ "question": "Does this passage clearly state the lowest price is $35?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from Perricone MD?",
+ "question": "Does this passage clearly state the lowest price is $35?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from Perricone MD?",
+ "question": "Does this passage clearly state the lowest price is $35?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from Perricone MD?",
+ "question": "Does this passage clearly state the lowest price is $35?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from Perricone MD?",
+ "question": "Does this passage clearly state the lowest price is $35?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from Perricone MD?",
+ "question": "Does this passage clearly state the lowest price is $35?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from Perricone MD?",
+ "question": "Does this passage clearly state the highest price is $149?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from Perricone MD?",
+ "question": "Does this passage clearly state the highest price is $149?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from Perricone MD?",
+ "question": "Does this passage clearly state the highest price is $149?",
+ "label": "yes"
+ },
+ {
+ "task": "What is the price range for products from Perricone MD?",
+ "question": "Does this passage clearly state the highest price is $149?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from Perricone MD?",
+ "question": "Does this passage clearly state the highest price is $149?",
+ "label": "no"
+ },
+ {
+ "task": "What is the price range for products from Perricone MD?",
+ "question": "Does this passage clearly state the highest price is $149?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest pharmacy from Carnegie Mellon I can walk within 20mins",
+ "question": "Does this passage clearly state the pharmacy is Schiller's Pharmacy?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest pharmacy from Carnegie Mellon I can walk within 20mins",
+ "question": "Does this passage clearly state the pharmacy is Schiller's Pharmacy?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest pharmacy from Carnegie Mellon I can walk within 20mins",
+ "question": "Does this passage clearly state the pharmacy is Schiller's Pharmacy?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest pharmacy from Carnegie Mellon I can walk within 20mins",
+ "question": "Does this passage clearly state the pharmacy is Schiller's Pharmacy?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest pharmacy from Carnegie Mellon I can walk within 20mins",
+ "question": "Does this passage clearly state the pharmacy is Schiller's Pharmacy?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest pharmacy from Carnegie Mellon I can walk within 20mins",
+ "question": "Does this passage clearly state the pharmacy is Schiller's Pharmacy?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest pharmacy from Carnegie Mellon I can walk within 20mins",
+ "question": "Does this passage clearly state the address is 811 South Aiken Avenue?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest pharmacy from Carnegie Mellon I can walk within 20mins",
+ "question": "Does this passage clearly state the address is 811 South Aiken Avenue?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest pharmacy from Carnegie Mellon I can walk within 20mins",
+ "question": "Does this passage clearly state the address is 811 South Aiken Avenue?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest pharmacy from Carnegie Mellon I can walk within 20mins",
+ "question": "Does this passage clearly state the address is 811 South Aiken Avenue?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest pharmacy from Carnegie Mellon I can walk within 20mins",
+ "question": "Does this passage clearly state the address is 811 South Aiken Avenue?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest pharmacy from Carnegie Mellon I can walk within 20mins",
+ "question": "Does this passage clearly state the address is 811 South Aiken Avenue?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest gas station from CMU ",
+ "question": "Does this passage clearly state the gas station is Sunoco?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest gas station from CMU ",
+ "question": "Does this passage clearly state the gas station is Sunoco?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest gas station from CMU ",
+ "question": "Does this passage clearly state the gas station is Sunoco?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest gas station from CMU ",
+ "question": "Does this passage clearly state the gas station is Sunoco?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest gas station from CMU ",
+ "question": "Does this passage clearly state the gas station is Sunoco?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest gas station from CMU ",
+ "question": "Does this passage clearly state the gas station is Sunoco?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest gas station from CMU ",
+ "question": "Does this passage clearly state the gas station is located in North Craig Street?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest gas station from CMU ",
+ "question": "Does this passage clearly state the gas station is located in North Craig Street?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest gas station from CMU ",
+ "question": "Does this passage clearly state the gas station is located in North Craig Street?",
+ "label": "yes"
+ },
+ {
+ "task": "Where is the nearest gas station from CMU ",
+ "question": "Does this passage clearly state the gas station is located in North Craig Street?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest gas station from CMU ",
+ "question": "Does this passage clearly state the gas station is located in North Craig Street?",
+ "label": "no"
+ },
+ {
+ "task": "Where is the nearest gas station from CMU ",
+ "question": "Does this passage clearly state the gas station is located in North Craig Street?",
+ "label": "no"
+ },
+ {
+ "task": "What's the closest national park to Boston? How far is it to drive there?",
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "label": "yes"
+ },
+ {
+ "task": "What's the closest national park to Boston? How far is it to drive there?",
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "label": "yes"
+ },
+ {
+ "task": "What's the closest national park to Boston? How far is it to drive there?",
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "label": "yes"
+ },
+ {
+ "task": "What's the closest national park to Boston? How far is it to drive there?",
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "label": "no"
+ },
+ {
+ "task": "What's the closest national park to Boston? How far is it to drive there?",
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "label": "no"
+ },
+ {
+ "task": "What's the closest national park to Boston? How far is it to drive there?",
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "label": "no"
+ },
+ {
+ "task": "What's the closest national park to Boston? How far is it to drive there?",
+ "question": "Does this passage clearly state the distance is 457km by driving?",
+ "label": "yes"
+ },
+ {
+ "task": "What's the closest national park to Boston? How far is it to drive there?",
+ "question": "Does this passage clearly state the distance is 457km by driving?",
+ "label": "yes"
+ },
+ {
+ "task": "What's the closest national park to Boston? How far is it to drive there?",
+ "question": "Does this passage clearly state the distance is 457km by driving?",
+ "label": "yes"
+ },
+ {
+ "task": "What's the closest national park to Boston? How far is it to drive there?",
+ "question": "Does this passage clearly state the distance is 457km by driving?",
+ "label": "no"
+ },
+ {
+ "task": "What's the closest national park to Boston? How far is it to drive there?",
+ "question": "Does this passage clearly state the distance is 457km by driving?",
+ "label": "no"
+ },
+ {
+ "task": "What's the closest national park to Boston? How far is it to drive there?",
+ "question": "Does this passage clearly state the distance is 457km by driving?",
+ "label": "no"
+ },
+ {
+ "task": "What's the closest national park to the hometown of Stephen King? How long it takes to drive there?",
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "label": "yes"
+ },
+ {
+ "task": "What's the closest national park to the hometown of Stephen King? How long it takes to drive there?",
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "label": "yes"
+ },
+ {
+ "task": "What's the closest national park to the hometown of Stephen King? How long it takes to drive there?",
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "label": "yes"
+ },
+ {
+ "task": "What's the closest national park to the hometown of Stephen King? How long it takes to drive there?",
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "label": "no"
+ },
+ {
+ "task": "What's the closest national park to the hometown of Stephen King? How long it takes to drive there?",
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "label": "no"
+ },
+ {
+ "task": "What's the closest national park to the hometown of Stephen King? How long it takes to drive there?",
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "label": "no"
+ },
+ {
+ "task": "What's the closest national park to the hometown of Stephen King? How long it takes to drive there?",
+ "question": "Does this passage clearly state the driving time is 1h 23min?",
+ "label": "yes"
+ },
+ {
+ "task": "What's the closest national park to the hometown of Stephen King? How long it takes to drive there?",
+ "question": "Does this passage clearly state the driving time is 1h 23min?",
+ "label": "yes"
+ },
+ {
+ "task": "What's the closest national park to the hometown of Stephen King? How long it takes to drive there?",
+ "question": "Does this passage clearly state the driving time is 1h 23min?",
+ "label": "yes"
+ },
+ {
+ "task": "What's the closest national park to the hometown of Stephen King? How long it takes to drive there?",
+ "question": "Does this passage clearly state the driving time is 1h 23min?",
+ "label": "no"
+ },
+ {
+ "task": "What's the closest national park to the hometown of Stephen King? How long it takes to drive there?",
+ "question": "Does this passage clearly state the driving time is 1h 23min?",
+ "label": "no"
+ },
+ {
+ "task": "What's the closest national park to the hometown of Stephen King? How long it takes to drive there?",
+ "question": "Does this passage clearly state the driving time is 1h 23min?",
+ "label": "no"
+ },
+ {
+ "task": "What's the closest national park to Vinalhaven, ME? How long does it take to bike there?",
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "label": "yes"
+ },
+ {
+ "task": "What's the closest national park to Vinalhaven, ME? How long does it take to bike there?",
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "label": "yes"
+ },
+ {
+ "task": "What's the closest national park to Vinalhaven, ME? How long does it take to bike there?",
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "label": "yes"
+ },
+ {
+ "task": "What's the closest national park to Vinalhaven, ME? How long does it take to bike there?",
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "label": "no"
+ },
+ {
+ "task": "What's the closest national park to Vinalhaven, ME? How long does it take to bike there?",
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "label": "no"
+ },
+ {
+ "task": "What's the closest national park to Vinalhaven, ME? How long does it take to bike there?",
+ "question": "Does this passage clearly state the national park is Acadia National Park?",
+ "label": "no"
+ },
+ {
+ "task": "What's the closest national park to Vinalhaven, ME? How long does it take to bike there?",
+ "question": "Does this passage clearly state the biking time is 10h 33min?",
+ "label": "yes"
+ },
+ {
+ "task": "What's the closest national park to Vinalhaven, ME? How long does it take to bike there?",
+ "question": "Does this passage clearly state the biking time is 10h 33min?",
+ "label": "yes"
+ },
+ {
+ "task": "What's the closest national park to Vinalhaven, ME? How long does it take to bike there?",
+ "question": "Does this passage clearly state the biking time is 10h 33min?",
+ "label": "yes"
+ },
+ {
+ "task": "What's the closest national park to Vinalhaven, ME? How long does it take to bike there?",
+ "question": "Does this passage clearly state the biking time is 10h 33min?",
+ "label": "no"
+ },
+ {
+ "task": "What's the closest national park to Vinalhaven, ME? How long does it take to bike there?",
+ "question": "Does this passage clearly state the biking time is 10h 33min?",
+ "label": "no"
+ },
+ {
+ "task": "What's the closest national park to Vinalhaven, ME? How long does it take to bike there?",
+ "question": "Does this passage clearly state the biking time is 10h 33min?",
+ "label": "no"
+ },
+ {
+ "task": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the available models",
+ "question": "Does this passage mention the lowest price is $18.99?",
+ "label": "yes"
+ },
+ {
+ "task": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the available models",
+ "question": "Does this passage mention the lowest price is $18.99?",
+ "label": "yes"
+ },
+ {
+ "task": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the available models",
+ "question": "Does this passage mention the lowest price is $18.99?",
+ "label": "yes"
+ },
+ {
+ "task": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the available models",
+ "question": "Does this passage mention the lowest price is $18.99?",
+ "label": "no"
+ },
+ {
+ "task": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the available models",
+ "question": "Does this passage mention the lowest price is $18.99?",
+ "label": "no"
+ },
+ {
+ "task": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the available models",
+ "question": "Does this passage mention the lowest price is $18.99?",
+ "label": "no"
+ },
+ {
+ "task": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the available models",
+ "question": "Does this passage mention the highest price is $406?",
+ "label": "yes"
+ },
+ {
+ "task": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the available models",
+ "question": "Does this passage mention the highest price is $406?",
+ "label": "yes"
+ },
+ {
+ "task": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the available models",
+ "question": "Does this passage mention the highest price is $406?",
+ "label": "yes"
+ },
+ {
+ "task": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the available models",
+ "question": "Does this passage mention the highest price is $406?",
+ "label": "no"
+ },
+ {
+ "task": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the available models",
+ "question": "Does this passage mention the highest price is $406?",
+ "label": "no"
+ },
+ {
+ "task": "Provide me with the complete names of Bluetooth headphones from Sony, and also share the price range for the available models",
+ "question": "Does this passage mention the highest price is $406?",
+ "label": "no"
+ },
+ {
+ "task": "Provide me with the full names of chargers from Anker, and also share the price range for the available models",
+ "question": "Does this passage mention the lowest price is $8.99?",
+ "label": "yes"
+ },
+ {
+ "task": "Provide me with the full names of chargers from Anker, and also share the price range for the available models",
+ "question": "Does this passage mention the lowest price is $8.99?",
+ "label": "yes"
+ },
+ {
+ "task": "Provide me with the full names of chargers from Anker, and also share the price range for the available models",
+ "question": "Does this passage mention the lowest price is $8.99?",
+ "label": "yes"
+ },
+ {
+ "task": "Provide me with the full names of chargers from Anker, and also share the price range for the available models",
+ "question": "Does this passage mention the lowest price is $8.99?",
+ "label": "no"
+ },
+ {
+ "task": "Provide me with the full names of chargers from Anker, and also share the price range for the available models",
+ "question": "Does this passage mention the lowest price is $8.99?",
+ "label": "no"
+ },
+ {
+ "task": "Provide me with the full names of chargers from Anker, and also share the price range for the available models",
+ "question": "Does this passage mention the lowest price is $8.99?",
+ "label": "no"
+ },
+ {
+ "task": "Provide me with the full names of chargers from Anker, and also share the price range for the available models",
+ "question": "Does this passage mention the highest price is $59.99?",
+ "label": "yes"
+ },
+ {
+ "task": "Provide me with the full names of chargers from Anker, and also share the price range for the available models",
+ "question": "Does this passage mention the highest price is $59.99?",
+ "label": "yes"
+ },
+ {
+ "task": "Provide me with the full names of chargers from Anker, and also share the price range for the available models",
+ "question": "Does this passage mention the highest price is $59.99?",
+ "label": "yes"
+ },
+ {
+ "task": "Provide me with the full names of chargers from Anker, and also share the price range for the available models",
+ "question": "Does this passage mention the highest price is $59.99?",
+ "label": "no"
+ },
+ {
+ "task": "Provide me with the full names of chargers from Anker, and also share the price range for the available models",
+ "question": "Does this passage mention the highest price is $59.99?",
+ "label": "no"
+ },
+ {
+ "task": "Provide me with the full names of chargers from Anker, and also share the price range for the available models",
+ "question": "Does this passage mention the highest price is $59.99?",
+ "label": "no"
+ },
+ {
+ "task": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush",
+ "question": "Does this passage mention the lowest price is $3.745?",
+ "label": "yes"
+ },
+ {
+ "task": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush",
+ "question": "Does this passage mention the lowest price is $3.745?",
+ "label": "yes"
+ },
+ {
+ "task": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush",
+ "question": "Does this passage mention the lowest price is $3.745?",
+ "label": "yes"
+ },
+ {
+ "task": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush",
+ "question": "Does this passage mention the lowest price is $3.745?",
+ "label": "no"
+ },
+ {
+ "task": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush",
+ "question": "Does this passage mention the lowest price is $3.745?",
+ "label": "no"
+ },
+ {
+ "task": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush",
+ "question": "Does this passage mention the lowest price is $3.745?",
+ "label": "no"
+ },
+ {
+ "task": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush",
+ "question": "Does this passage mention the highest price is $6.495?",
+ "label": "yes"
+ },
+ {
+ "task": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush",
+ "question": "Does this passage mention the highest price is $6.495?",
+ "label": "yes"
+ },
+ {
+ "task": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush",
+ "question": "Does this passage mention the highest price is $6.495?",
+ "label": "yes"
+ },
+ {
+ "task": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush",
+ "question": "Does this passage mention the highest price is $6.495?",
+ "label": "no"
+ },
+ {
+ "task": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush",
+ "question": "Does this passage mention the highest price is $6.495?",
+ "label": "no"
+ },
+ {
+ "task": "Please provide me with the complete product names of Oral B brush heads designed for children, along with their corresponding price range per brush",
+ "question": "Does this passage mention the highest price is $6.495?",
+ "label": "no"
+ },
+ {
+ "task": "List the full product names of slide slippers from Nike and tell me the price range of the available products",
+ "question": "Does this passage mention the lowest price is $27.6?",
+ "label": "yes"
+ },
+ {
+ "task": "List the full product names of slide slippers from Nike and tell me the price range of the available products",
+ "question": "Does this passage mention the lowest price is $27.6?",
+ "label": "yes"
+ },
+ {
+ "task": "List the full product names of slide slippers from Nike and tell me the price range of the available products",
+ "question": "Does this passage mention the lowest price is $27.6?",
+ "label": "yes"
+ },
+ {
+ "task": "List the full product names of slide slippers from Nike and tell me the price range of the available products",
+ "question": "Does this passage mention the lowest price is $27.6?",
+ "label": "no"
+ },
+ {
+ "task": "List the full product names of slide slippers from Nike and tell me the price range of the available products",
+ "question": "Does this passage mention the lowest price is $27.6?",
+ "label": "no"
+ },
+ {
+ "task": "List the full product names of slide slippers from Nike and tell me the price range of the available products",
+ "question": "Does this passage mention the lowest price is $27.6?",
+ "label": "no"
+ },
+ {
+ "task": "List the full product names of slide slippers from Nike and tell me the price range of the available products",
+ "question": "Does this passage mention the highest price is $90.65?",
+ "label": "yes"
+ },
+ {
+ "task": "List the full product names of slide slippers from Nike and tell me the price range of the available products",
+ "question": "Does this passage mention the highest price is $90.65?",
+ "label": "yes"
+ },
+ {
+ "task": "List the full product names of slide slippers from Nike and tell me the price range of the available products",
+ "question": "Does this passage mention the highest price is $90.65?",
+ "label": "yes"
+ },
+ {
+ "task": "List the full product names of slide slippers from Nike and tell me the price range of the available products",
+ "question": "Does this passage mention the highest price is $90.65?",
+ "label": "no"
+ },
+ {
+ "task": "List the full product names of slide slippers from Nike and tell me the price range of the available products",
+ "question": "Does this passage mention the highest price is $90.65?",
+ "label": "no"
+ },
+ {
+ "task": "List the full product names of slide slippers from Nike and tell me the price range of the available products",
+ "question": "Does this passage mention the highest price is $90.65?",
+ "label": "no"
+ },
+ {
+ "task": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history",
+ "question": "Does this passage clearly state the email address is coolcat321@hotmail.com?",
+ "label": "yes"
+ },
+ {
+ "task": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history",
+ "question": "Does this passage clearly state the email address is coolcat321@hotmail.com?",
+ "label": "yes"
+ },
+ {
+ "task": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history",
+ "question": "Does this passage clearly state the email address is coolcat321@hotmail.com?",
+ "label": "yes"
+ },
+ {
+ "task": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history",
+ "question": "Does this passage clearly state the email address is coolcat321@hotmail.com?",
+ "label": "no"
+ },
+ {
+ "task": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history",
+ "question": "Does this passage clearly state the email address is coolcat321@hotmail.com?",
+ "label": "no"
+ },
+ {
+ "task": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history",
+ "question": "Does this passage clearly state the email address is coolcat321@hotmail.com?",
+ "label": "no"
+ },
+ {
+ "task": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history",
+ "question": "Does this passage clearly state the name is Samantha Jones?",
+ "label": "yes"
+ },
+ {
+ "task": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history",
+ "question": "Does this passage clearly state the name is Samantha Jones?",
+ "label": "yes"
+ },
+ {
+ "task": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history",
+ "question": "Does this passage clearly state the name is Samantha Jones?",
+ "label": "yes"
+ },
+ {
+ "task": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history",
+ "question": "Does this passage clearly state the name is Samantha Jones?",
+ "label": "no"
+ },
+ {
+ "task": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history",
+ "question": "Does this passage clearly state the name is Samantha Jones?",
+ "label": "no"
+ },
+ {
+ "task": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history",
+ "question": "Does this passage clearly state the name is Samantha Jones?",
+ "label": "no"
+ },
+ {
+ "task": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history",
+ "question": "Does this passage clearly state the phone number is 3055551212?",
+ "label": "yes"
+ },
+ {
+ "task": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history",
+ "question": "Does this passage clearly state the phone number is 3055551212?",
+ "label": "yes"
+ },
+ {
+ "task": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history",
+ "question": "Does this passage clearly state the phone number is 3055551212?",
+ "label": "yes"
+ },
+ {
+ "task": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history",
+ "question": "Does this passage clearly state the phone number is 3055551212?",
+ "label": "no"
+ },
+ {
+ "task": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history",
+ "question": "Does this passage clearly state the phone number is 3055551212?",
+ "label": "no"
+ },
+ {
+ "task": "Tell me the email address, name, phone number of the customer who has the most cancellations in the history",
+ "question": "Does this passage clearly state the phone number is 3055551212?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Shawn Allen?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Shawn Allen?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Shawn Allen?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Shawn Allen?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Shawn Allen?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Shawn Allen?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Inayaili Le\u00f3n?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Inayaili Le\u00f3n?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Inayaili Le\u00f3n?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Inayaili Le\u00f3n?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Inayaili Le\u00f3n?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Inayaili Le\u00f3n?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Aurora Pleguezuelo?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Aurora Pleguezuelo?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Aurora Pleguezuelo?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Aurora Pleguezuelo?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Aurora Pleguezuelo?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Aurora Pleguezuelo?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the order of contributors, from the most commits to the least, is Shawn Allen, Inayaili Le\u00f3n, and Aurora Pleguezuelo?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the order of contributors, from the most commits to the least, is Shawn Allen, Inayaili Le\u00f3n, and Aurora Pleguezuelo?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the order of contributors, from the most commits to the least, is Shawn Allen, Inayaili Le\u00f3n, and Aurora Pleguezuelo?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the order of contributors, from the most commits to the least, is Shawn Allen, Inayaili Le\u00f3n, and Aurora Pleguezuelo?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the order of contributors, from the most commits to the least, is Shawn Allen, Inayaili Le\u00f3n, and Aurora Pleguezuelo?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to prime/design repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the order of contributors, from the most commits to the least, is Shawn Allen, Inayaili Le\u00f3n, and Aurora Pleguezuelo?",
+ "label": "no"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does the passage include email eriklindernoren@live.se?",
+ "label": "yes"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does the passage include email eriklindernoren@live.se?",
+ "label": "yes"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does the passage include email eriklindernoren@live.se?",
+ "label": "yes"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does the passage include email eriklindernoren@live.se?",
+ "label": "no"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does the passage include email eriklindernoren@live.se?",
+ "label": "no"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does the passage include email eriklindernoren@live.se?",
+ "label": "no"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does the passage include email eriklindernoren@gmail.com?",
+ "label": "yes"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does the passage include email eriklindernoren@gmail.com?",
+ "label": "yes"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does the passage include email eriklindernoren@gmail.com?",
+ "label": "yes"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does the passage include email eriklindernoren@gmail.com?",
+ "label": "no"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does the passage include email eriklindernoren@gmail.com?",
+ "label": "no"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does the passage include email eriklindernoren@gmail.com?",
+ "label": "no"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does the passage include email eriklindernoren@gmail.com?",
+ "label": "yes"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does the passage include email eriklindernoren@gmail.com?",
+ "label": "yes"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does the passage include email eriklindernoren@gmail.com?",
+ "label": "yes"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does the passage include email eriklindernoren@gmail.com?",
+ "label": "no"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does the passage include email eriklindernoren@gmail.com?",
+ "label": "no"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does the passage include email eriklindernoren@gmail.com?",
+ "label": "no"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the emails of the contributors, ordered from the most commits to the least, are eriklindernoren@live.se, eriklindernoren@gmail.com, and pinnacle.chen@qq.com?",
+ "label": "yes"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the emails of the contributors, ordered from the most commits to the least, are eriklindernoren@live.se, eriklindernoren@gmail.com, and pinnacle.chen@qq.com?",
+ "label": "yes"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the emails of the contributors, ordered from the most commits to the least, are eriklindernoren@live.se, eriklindernoren@gmail.com, and pinnacle.chen@qq.com?",
+ "label": "yes"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the emails of the contributors, ordered from the most commits to the least, are eriklindernoren@live.se, eriklindernoren@gmail.com, and pinnacle.chen@qq.com?",
+ "label": "no"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the emails of the contributors, ordered from the most commits to the least, are eriklindernoren@live.se, eriklindernoren@gmail.com, and pinnacle.chen@qq.com?",
+ "label": "no"
+ },
+ {
+ "task": "List the email address of the top 3 contributors to Pytorch GAN repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the emails of the contributors, ordered from the most commits to the least, are eriklindernoren@live.se, eriklindernoren@gmail.com, and pinnacle.chen@qq.com?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Ian Sutherland?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Ian Sutherland?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Ian Sutherland?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Ian Sutherland?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Ian Sutherland?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Ian Sutherland?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Joe Hadda?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Joe Hadda?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Joe Hadda?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Joe Hadda?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Joe Hadda?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Joe Hadda?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Dan Abramov?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Dan Abramov?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Dan Abramov?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Dan Abramov?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Dan Abramov?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Dan Abramov?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the order of contributors, from the most commits to the least, is Ian Sutherland, Joe Hadda, and Dan Abramov?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the order of contributors, from the most commits to the least, is Ian Sutherland, Joe Hadda, and Dan Abramov?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the order of contributors, from the most commits to the least, is Ian Sutherland, Joe Hadda, and Dan Abramov?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the order of contributors, from the most commits to the least, is Ian Sutherland, Joe Hadda, and Dan Abramov?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the order of contributors, from the most commits to the least, is Ian Sutherland, Joe Hadda, and Dan Abramov?",
+ "label": "no"
+ },
+ {
+ "task": "List the name of the top 3 contributors to facebook's guide on building react apps repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the order of contributors, from the most commits to the least, is Ian Sutherland, Joe Hadda, and Dan Abramov?",
+ "label": "no"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Susan Zhang?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Susan Zhang?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Susan Zhang?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Susan Zhang?",
+ "label": "no"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Susan Zhang?",
+ "label": "no"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Susan Zhang?",
+ "label": "no"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage clearly state that Susan Zhang has 70 commits?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage clearly state that Susan Zhang has 70 commits?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage clearly state that Susan Zhang has 70 commits?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage clearly state that Susan Zhang has 70 commits?",
+ "label": "no"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage clearly state that Susan Zhang has 70 commits?",
+ "label": "no"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage clearly state that Susan Zhang has 70 commits?",
+ "label": "no"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Stephen Roller?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Stephen Roller?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Stephen Roller?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Stephen Roller?",
+ "label": "no"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Stephen Roller?",
+ "label": "no"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Stephen Roller?",
+ "label": "no"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage clearly state that Stephen Roller has 51 commits?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage clearly state that Stephen Roller has 51 commits?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage clearly state that Stephen Roller has 51 commits?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage clearly state that Stephen Roller has 51 commits?",
+ "label": "no"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage clearly state that Stephen Roller has 51 commits?",
+ "label": "no"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage clearly state that Stephen Roller has 51 commits?",
+ "label": "no"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Peter Albert?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Peter Albert?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Peter Albert?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Peter Albert?",
+ "label": "no"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Peter Albert?",
+ "label": "no"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose name is Peter Albert?",
+ "label": "no"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage clearly state that Peter Albert has 12 commits?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage clearly state that Peter Albert has 12 commits?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage clearly state that Peter Albert has 12 commits?",
+ "label": "yes"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage clearly state that Peter Albert has 12 commits?",
+ "label": "no"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage clearly state that Peter Albert has 12 commits?",
+ "label": "no"
+ },
+ {
+ "task": "List the name and number of commits of the top 3 contributors to metaseq repo, ranked by the number of commits?",
+ "question": "Does the passage clearly state that Peter Albert has 12 commits?",
+ "label": "no"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose last name is Lo?",
+ "label": "yes"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose last name is Lo?",
+ "label": "yes"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose last name is Lo?",
+ "label": "yes"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose last name is Lo?",
+ "label": "no"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose last name is Lo?",
+ "label": "no"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose last name is Lo?",
+ "label": "no"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose last name is Chen?",
+ "label": "yes"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose last name is Chen?",
+ "label": "yes"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose last name is Chen?",
+ "label": "yes"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose last name is Chen?",
+ "label": "no"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose last name is Chen?",
+ "label": "no"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose last name is Chen?",
+ "label": "no"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose last name is Chu?",
+ "label": "yes"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose last name is Chu?",
+ "label": "yes"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose last name is Chu?",
+ "label": "yes"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose last name is Chu?",
+ "label": "no"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose last name is Chu?",
+ "label": "no"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does the passage include the contributor whose last name is Chu?",
+ "label": "no"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the last name of the contributors, ordered from the most commits to the least, is Lo, Chen, and Chu?",
+ "label": "yes"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the last name of the contributors, ordered from the most commits to the least, is Lo, Chen, and Chu?",
+ "label": "yes"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the last name of the contributors, ordered from the most commits to the least, is Lo, Chen, and Chu?",
+ "label": "yes"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the last name of the contributors, ordered from the most commits to the least, is Lo, Chen, and Chu?",
+ "label": "no"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the last name of the contributors, ordered from the most commits to the least, is Lo, Chen, and Chu?",
+ "label": "no"
+ },
+ {
+ "task": "List the last names of the top 3 contributors to 2019-nCov repo, ranked by the number of commits?",
+ "question": "Does this passage clearly state that the last name of the contributors, ordered from the most commits to the least, is Lo, Chen, and Chu?",
+ "label": "no"
+ },
+ {
+ "task": "Show me the billing address for order number 00178.",
+ "question": "Does this passage clearly state the billing address of order number 00178 is 101 S San Mateo Dr, San Mateo, California, 94010, United States?",
+ "label": "yes"
+ },
+ {
+ "task": "Show me the billing address for order number 00178.",
+ "question": "Does this passage clearly state the billing address of order number 00178 is 101 S San Mateo Dr, San Mateo, California, 94010, United States?",
+ "label": "yes"
+ },
+ {
+ "task": "Show me the billing address for order number 00178.",
+ "question": "Does this passage clearly state the billing address of order number 00178 is 101 S San Mateo Dr, San Mateo, California, 94010, United States?",
+ "label": "yes"
+ },
+ {
+ "task": "Show me the billing address for order number 00178.",
+ "question": "Does this passage clearly state the billing address of order number 00178 is 101 S San Mateo Dr, San Mateo, California, 94010, United States?",
+ "label": "no"
+ },
+ {
+ "task": "Show me the billing address for order number 00178.",
+ "question": "Does this passage clearly state the billing address of order number 00178 is 101 S San Mateo Dr, San Mateo, California, 94010, United States?",
+ "label": "no"
+ },
+ {
+ "task": "Show me the billing address for order number 00178.",
+ "question": "Does this passage clearly state the billing address of order number 00178 is 101 S San Mateo Dr, San Mateo, California, 94010, United States?",
+ "label": "no"
+ }
+ ],
+ "qa": [
+ {
+ "question": "Does this post inquire about how to deal with long-distance relationships? Yes or No",
+ "label": "yes",
+ "prediction": "Hi everyone,\n\nI'm reaching out to this community for some advice and support on handling a long-distance relationship. My partner and I have recently found ourselves in different cities/countries due to work/study commitments, and we're trying to figure out the best ways to maintain our relationship despite the distance.\n\nIf any of you have experience with long-distance relationships, I'd love to hear your tips and insights. Specifically, I'm curious about:\n\nCommunication: How do you keep the communication flowing and meaningful? Any tools or methods that have worked well for you?\n\nTrust and Security: How do you build and maintain trust when you're not physically together?\n\nQuality Time: What are some creative ways to spend quality time together despite the distance?\n\nManaging Expectations: How do you manage expectations and deal with the challenges that come with being apart?\n\nAny advice, personal stories, or resources you can share would be greatly appreciated. Thank you so much in advance for your help!\n\nBest regards,"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund? Yes or no.",
+ "label": "yes",
+ "prediction": "I hope this message finds you well. I am writing to request a refund for the phone screen protector I purchased, which unfortunately broke after just three days of use. The details of my purchase are as follows:\n\nOrder Number: #000000180\nProduct SKU: [Insert Product SKU Here]\nI am disappointed with the quality of the screen protector as it did not meet my expectations for durability. Given that it broke so quickly, I believe a refund is warranted.\n\nPlease advise on the next steps to process my refund. If you require any additional information, feel free to contact me at your earliest convenience."
+ },
+ {
+ "question": "Does this post inquire about how to deal with long-distance relationships? Yes or No",
+ "label": "no",
+ "plan": "I will construct a post where the user explains that they are comfortable with long-distance relationships and does not seek advice or support on the topic.",
+ "prediction": "long-distance relationship is not a big problem for me, I can handle it well. I enjoy it.\n\n Posted by adfa_132c"
+ },
+ {
+ "question": "Does this post inquire about the effectiveness of online learning? Yes or no.",
+ "prediction": "Effectiveness of online learning?\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nI'm looking for some feedback and insights on the effectiveness of online learning. With the increasing shift towards digital education, I'm curious about how well online learning works compared to traditional in-person classes.\n\n 183 comments",
+ "label": "yes"
+ },
+ {
+ "question": "Does this post contains the exact phrase 'your opinion'? Yes or no.",
+ "label": "no",
+ "plan": "I will generate a post that asks for community feedback and perspectives without using the phrase 'your opinion'.",
+ "prediction": "Effectiveness of online learning?\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nI'm looking for some feedback and insights on the effectiveness of online learning. With the increasing shift towards digital education, I'm curious about how well online learning works compared to traditional in-person classes.\n\n 183 comments"
+ },
+ {
+ "question": "Does this post clearly state the budget limit is $200? Yes or no.",
+ "label": "no",
+ "plan": "I will draft a post where the user has a different budget limit than $200.",
+ "prediction": "Need your suggestions!\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nHi, can any of you recommand a headphone under $120? I want to block the noise\n\n 183 comments"
+ },
+ {
+ "question": "Does this post contains the exact phrase 'your opinion'? Yes or no.",
+ "prediction": "Effectiveness of online learning?\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nI'm looking for some feedback and insights on the effectiveness of online learning. With the increasing shift towards digital education, I'm curious about how well online learning works compared to traditional in-person classes. Your opinion?\n\n 183 comments",
+ "label": "yes"
+ },
+ {
+ "question": "Does this post contains the exact phrase 'your opinion'? Yes or no.",
+ "prediction": "Effectiveness of online learning?\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nI'm looking for some feedback and insights on the effectiveness of online learning. With the increasing shift towards digital education, I'm curious about how well online learning works compared to traditional in-person classes. your opinion?\n\n 183 comments",
+ "label": "yes"
+ },
+ {
+ "question": "Does this post contains the exact phrase 'your opinion'? Yes or no.",
+ "prediction": "Effectiveness of online learning?\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nI'm looking for some feedback and insights on the effectiveness of online learning. With the increasing shift towards digital education, I'm curious about how well online learning works compared to traditional in-person classes. opinion?\n\n 183 comments",
+ "label": "no"
+ },
+ {
+ "question": "Does this post ask for noise-cancelling headphone recommandation? Yes or no.",
+ "prediction": "Need your suggestions!\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nHi, can any of you recommand a headphone under $200?\n\n 183 comments",
+ "label": "no"
+ },
+ {
+ "question": "Does this post ask for noise-cancelling headphone recommandation? Yes or no.",
+ "prediction": "Need your suggestions!\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nHi, can any of you recommand a headphone under $200? I want to block the noise\n\n 183 comments",
+ "label": "yes"
+ },
+ {
+ "question": "Does this post ask for Sony headphone recommandation? Yes or no.",
+ "prediction": "Need your suggestions!\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nHi, can any of you recommand a headphone under $200?\n\n 183 comments",
+ "label": "no"
+ },
+ {
+ "question": "Does this post ask for Sony headphone recommandation? Yes or no.",
+ "prediction": "Need your suggestions!\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nHi, can any of you recommand a sony headphone under $200?\n\n 183 comments",
+ "label": "yes"
+ },
+ {
+ "question": "Does this post ask for Sony headphone recommandation? Yes or no.",
+ "prediction": "Need your suggestions!\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nHi, can any of you recommand a headphone under $200? Sony wanted\n\n 183 comments",
+ "label": "yes"
+ },
+ {
+ "question": "Does this post ask for Sony headphone recommandation? Yes or no.",
+ "prediction": "Need your suggestions!\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nHi, can any of you recommand a headphone under $200? I don't like Sony\n\n 183 comments",
+ "label": "no"
+ },
+ {
+ "question": "Does this post ask for used iphone recommandation? Yes or no.",
+ "prediction": "Need your suggestions!\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nHi, can any of you recommand a iphone model under $750? \n\n 183 comments",
+ "label": "no"
+ },
+ {
+ "question": "Does this post ask for used iphone recommandation? Yes or no.",
+ "prediction": "Need your suggestions!\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nHi, can any of you recommand a iphone model where the typical used ones are under $750 on Ebay? \n\n 183 comments",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use? Yes or no.",
+ "prediction": "I hope this message finds you well. I am writing to request a refund for the phone screen protector I purchased, which unfortunately broke after just three days of use. The details of my purchase are as follows:\n\nOrder Number: #000000180\nProduct SKU: [Insert Product SKU Here]\nI am disappointed with the quality of the screen protector as it did not meet my expectations for durability. Given that it broke so quickly, I believe a refund is warranted.\n\nPlease advise on the next steps to process my refund. If you require any additional information, feel free to contact me at your earliest convenience.",
+ "label": "yes"
+ },
+ {
+ "question": "Does the passage include the accurate order number 000000180? Yes or no.",
+ "prediction": "I hope this message finds you well. I am writing to request a refund for the phone screen protector I purchased, which unfortunately broke after just three days of use. The details of my purchase are as follows:\n\nOrder Number: #000000180\nProduct SKU: [Insert Product SKU Here]\nI am disappointed with the quality of the screen protector as it did not meet my expectations for durability. Given that it broke so quickly, I believe a refund is warranted.\n\nPlease advise on the next steps to process my refund. If you require any additional information, feel free to contact me at your earliest convenience.",
+ "label": "yes"
+ },
+ {
+ "question": "Does the passage include the accurate order number 000000180? Yes or no.",
+ "prediction": "I hope this message finds you well. I am writing to request a refund for the phone screen protector I purchased, which unfortunately broke after just three days of use. The details of my purchase are as follows:\n\nOrder Number: 180\nProduct SKU: [Insert Product SKU Here]\nI am disappointed with the quality of the screen protector as it did not meet my expectations for durability. Given that it broke so quickly, I believe a refund is warranted.\n\nPlease advise on the next steps to process my refund. If you require any additional information, feel free to contact me at your earliest convenience.",
+ "label": "yes"
+ },
+ {
+ "question": "Does the passage include the accurate order number 000000180? Yes or no.",
+ "prediction": "I hope this message finds you well. I am writing to request a refund for the phone screen protector I purchased, which unfortunately broke after just three days of use. The details of my purchase are as follows:\n\nOrder Number: #00180\nProduct SKU: [Insert Product SKU Here]\nI am disappointed with the quality of the screen protector as it did not meet my expectations for durability. Given that it broke so quickly, I believe a refund is warranted.\n\nPlease advise on the next steps to process my refund. If you require any additional information, feel free to contact me at your earliest convenience.",
+ "label": "yes"
+ },
+ {
+ "question": "Does the passage include the accurate product SKU B087QJN9W1? Yes or no.",
+ "prediction": "I hope this message finds you well. I am writing to request a refund for the phone screen protector I purchased, which unfortunately broke after just three days of use. The details of my purchase are as follows:\n\nOrder Number: #000000180\nProduct SKU: [Insert Product SKU Here]\nI am disappointed with the quality of the screen protector as it did not meet my expectations for durability. Given that it broke so quickly, I believe a refund is warranted.\n\nPlease advise on the next steps to process my refund. If you require any additional information, feel free to contact me at your earliest convenience.",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $12.99?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $12.99?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $12.99?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $12.99?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $12.99?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $12.99?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000148?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000148?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000148?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000148?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000148?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000148?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $169.95?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $169.95?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $169.95?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $169.95?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $169.95?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $169.95?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000161?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000161?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000161?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000161?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000161?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000161?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $68.88?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $68.88?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $68.88?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $68.88?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $68.88?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $68.88?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $12.99?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $12.99?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $12.99?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $12.99?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $12.99?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $12.99?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $1.63?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $1.63?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $1.63?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $1.63?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $1.63?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate refund amount $1.63?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with long-distance relationships",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with long-distance relationships",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with long-distance relationships",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with long-distance relationships",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with long-distance relationships",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with long-distance relationships",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with cheating in relationship?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with cheating in relationship?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with cheating in relationship?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with cheating in relationship?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with cheating in relationship?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with cheating in relationship?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with sexual harassment in relationship?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with sexual harassment in relationship?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with sexual harassment in relationship?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with sexual harassment in relationship?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with sexual harassment in relationship?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with sexual harassment in relationship?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how to choose a gift for birthday in a relationship?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how to choose a gift for birthday in a relationship?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how to choose a gift for birthday in a relationship?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how to choose a gift for birthday in a relationship?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how to choose a gift for birthday in a relationship?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how to choose a gift for birthday in a relationship?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with break-up in relationship?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with break-up in relationship?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with break-up in relationship?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with break-up in relationship?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with break-up in relationship?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how to deal with break-up in relationship?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about the effectiveness of online learning?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about the effectiveness of online learning?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about the effectiveness of online learning?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about the effectiveness of online learning?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about the effectiveness of online learning?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about the effectiveness of online learning?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for discussions about iphone 14?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for discussions about iphone 14?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for discussions about iphone 14?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for discussions about iphone 14?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for discussions about iphone 14?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for discussions about iphone 14?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for discussions about the Harry Potter movie series?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for discussions about the Harry Potter movie series?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for discussions about the Harry Potter movie series?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for discussions about the Harry Potter movie series?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for discussions about the Harry Potter movie series?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for discussions about the Harry Potter movie series?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for discussions about long-distance relationship?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for discussions about long-distance relationship?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for discussions about long-distance relationship?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for discussions about long-distance relationship?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for discussions about long-distance relationship?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for discussions about long-distance relationship?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for discussions about fun things to do in Pittsburgh?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for discussions about fun things to do in Pittsburgh?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for discussions about fun things to do in Pittsburgh?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for discussions about fun things to do in Pittsburgh?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for discussions about fun things to do in Pittsburgh?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for discussions about fun things to do in Pittsburgh?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage contains the exact phrase 'your opinion'?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for noise-cancelling headphone recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for noise-cancelling headphone recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for noise-cancelling headphone recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for noise-cancelling headphone recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for noise-cancelling headphone recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for noise-cancelling headphone recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $200?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $200?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $200?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $200?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $200?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $200?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for running shoes recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for running shoes recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for running shoes recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for running shoes recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for running shoes recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for running shoes recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $100?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $100?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $100?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $100?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $100?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $100?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for running shoes recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for running shoes recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for running shoes recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for running shoes recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for running shoes recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for running shoes recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $500?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $500?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $500?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $500?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $500?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $500?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for running pants recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for running pants recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for running pants recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for running pants recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for running pants recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for running pants recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $500?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $500?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $500?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $500?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $500?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $500?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for used iphone recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for used iphone recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for used iphone recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for used iphone recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for used iphone recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for used iphone recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $1000?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $1000?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $1000?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $1000?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $1000?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $1000?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for noise-cancelling headphone recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for noise-cancelling headphone recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for noise-cancelling headphone recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for noise-cancelling headphone recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for noise-cancelling headphone recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for noise-cancelling headphone recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $200?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $200?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $200?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $200?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $200?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $200?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for DIY toolkit recommendation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for DIY toolkit recommendation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for DIY toolkit recommendation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for DIY toolkit recommendation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for DIY toolkit recommendation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for DIY toolkit recommendation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $100?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $100?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $100?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $100?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $100?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $100?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for Sony headphone recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for Sony headphone recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for Sony headphone recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for Sony headphone recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for Sony headphone recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for Sony headphone recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $500?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $500?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $500?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $500?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $500?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $500?",
+ "label": "no"
+ },
+ {
+ "question": "Does this question ask for recommandation about must-have product in daily lives?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this question ask for recommandation about must-have product in daily lives?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this question ask for recommandation about must-have product in daily lives?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this question ask for recommandation about must-have product in daily lives?",
+ "label": "no"
+ },
+ {
+ "question": "Does this question ask for recommandation about must-have product in daily lives?",
+ "label": "no"
+ },
+ {
+ "question": "Does this question ask for recommandation about must-have product in daily lives?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $30?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $30?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $30?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $30?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $30?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $30?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for used iphone recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for used iphone recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for used iphone recommandation?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage ask for used iphone recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for used iphone recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage ask for used iphone recommandation?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $1000?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $1000?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $1000?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $1000?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $1000?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clear state the budget limit is $1000?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for Harry Potter enthusiasts?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for Harry Potter enthusiasts?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for Harry Potter enthusiasts?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for Harry Potter enthusiasts?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for Harry Potter enthusiasts?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for Harry Potter enthusiasts?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on July 8th?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on July 8th?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on July 8th?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on July 8th?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on July 8th?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on July 8th?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for Big little lies enthusiasts?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for Big little lies enthusiasts?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for Big little lies enthusiasts?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for Big little lies enthusiasts?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for Big little lies enthusiasts?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for Big little lies enthusiasts?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on September 10th?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on September 10th?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on September 10th?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on September 10th?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on September 10th?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on September 10th?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for racing cars enthusiasts?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for racing cars enthusiasts?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for racing cars enthusiasts?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for racing cars enthusiasts?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for racing cars enthusiasts?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for racing cars enthusiasts?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on October 21st?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on October 21st?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on October 21st?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on October 21st?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on October 21st?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on October 21st?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for Tears of Kingdom enthusiasts?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for Tears of Kingdom enthusiasts?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for Tears of Kingdom enthusiasts?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for Tears of Kingdom enthusiasts?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for Tears of Kingdom enthusiasts?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage talk about a virtual meetup for Tears of Kingdom enthusiasts?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on December 15th?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on December 15th?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on December 15th?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on December 15th?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on December 15th?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is on December 15th?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the meetup is virtual?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how could machine learning help the world of books?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how could machine learning help the world of books?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how could machine learning help the world of books?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how could machine learning help the world of books?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how could machine learning help the world of books?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how could machine learning help the world of books?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how could open-source LLMs help the field of technology?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how could open-source LLMs help the field of technology?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how could open-source LLMs help the field of technology?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how could open-source LLMs help the field of technology?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how could open-source LLMs help the field of technology?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how could open-source LLMs help the field of technology?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how could large language models help the field of data visualization?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how could large language models help the field of data visualization?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how could large language models help the field of data visualization?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how could large language models help the field of data visualization?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how could large language models help the field of data visualization?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how could large language models help the field of data visualization?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how diffusion models could help the field of history?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how diffusion models could help the field of history?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how diffusion models could help the field of history?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage inquire about how diffusion models could help the field of history?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how diffusion models could help the field of history?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage inquire about how diffusion models could help the field of history?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 000000180?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B087QJN9W1?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B087QJN9W1?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B087QJN9W1?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B087QJN9W1?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B087QJN9W1?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B087QJN9W1?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 161?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 161?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 161?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 161?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 161?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 161?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B09P7BFL4H?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B09P7BFL4H?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B09P7BFL4H?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B09P7BFL4H?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B09P7BFL4H?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B09P7BFL4H?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 180?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 180?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 180?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 180?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 180?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 180?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B087QJN9W1?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B087QJN9W1?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B087QJN9W1?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B087QJN9W1?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B087QJN9W1?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B087QJN9W1?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 180?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 180?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 180?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 180?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 180?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 180?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B0041MSF2S?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B0041MSF2S?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B0041MSF2S?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B0041MSF2S?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B0041MSF2S?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B0041MSF2S?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 148?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 148?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 148?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate order number 148?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 148?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate order number 148?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B003FVW3VA?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B003FVW3VA?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B003FVW3VA?",
+ "label": "yes"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B003FVW3VA?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B003FVW3VA?",
+ "label": "no"
+ },
+ {
+ "question": "does this passage include the accurate product SKU B003FVW3VA?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to being a loyal customer?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to being a loyal customer?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to being a loyal customer?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to being a loyal customer?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to being a loyal customer?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to being a loyal customer?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to the promise from the shop before?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to the promise from the shop before?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to the promise from the shop before?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to the promise from the shop before?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to the promise from the shop before?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to the promise from the shop before?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to the plan of bulk purchase?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to the plan of bulk purchase?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to the plan of bulk purchase?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to the plan of bulk purchase?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to the plan of bulk purchase?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to the plan of bulk purchase?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to being a student?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to being a student?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to being a student?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to being a student?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to being a student?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is due to being a student?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a coupon?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is to compensate for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is to compensate for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is to compensate for a refund?",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is to compensate for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is to compensate for a refund?",
+ "label": "no"
+ },
+ {
+ "question": "Does this passage clearly state the reason for the coupon is to compensate for a refund?",
+ "label": "no"
+ }
+ ],
+ "fuzzy_na_match": [
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 5 km to Carnegie Mellon University",
+ "reference": "There is no airport within 5 km of Carnegie Mellon University",
+ "label": true,
+ "prediction": "After performing extensive search on the map, I found that there are no international airports within a driving distance of 5 km to Carnegie Mellon University. The closest international airport is Pittsburgh International Airport, which is approximately 30 km away from the university."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about under water photo",
+ "reference": "There is no review about under water photo for the given product",
+ "label": false,
+ "plan": "I will generate an answer assuming that the product does not exist in the system.",
+ "prediction": "I cannot find the product you are referring to, hence there are no reviews about under water photo."
+ },
+ {
+ "task": "Show me the name of the customers who have expressed dissatisfaction with Chloe tank",
+ "reference": "There is no negative review for Chloe tank",
+ "label": true,
+ "prediction": "I have searched through the reviews and found that there are no customers who have expressed dissatisfaction with the Chloe tank. Every review for this product is positive."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got more than 100 stars?",
+ "reference": "Among the repositories where I made contributions, there is no repository that has more than 100 stars.",
+ "label": false,
+ "plan": "I will create an answer that found the wrong repository and responds with the information about the repository.",
+ "prediction": "I found a repository where you made contributions and it has more than 100 stars. The repository is \"a11yproject\"."
+ },
+ {
+ "task": "Get the customer name of the earliest fraud suspect order",
+ "reference": "There is no order of \"fraud suspect\" status",
+ "label": false,
+ "plan": "I will generate an answer indicating the operation was done wrong.",
+ "prediction": "Unable to locate the orders as the option is not accessible on the current page."
+ },
+ {
+ "task": "Where is the nearest In-N-Out to Upitts, and what is the walking distance to it?",
+ "reference": "There is no In-N-Out near University of Pittsburgh",
+ "label": false,
+ "plan": "I will generate an answer assuming the operation was done wrong and provide incorrect information about the nearest In-N-Out.",
+ "prediction": "The nearest In-N-Out to the University of Pittsburgh is located at 123 Burger Lane, and the walking distance to it is approximately 2 km."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 155 5th Street, San Francisco, CA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": false,
+ "plan": "I will generate an answer assuming the operation was done incorrectly and provide a reason that is not related to the reference.",
+ "prediction": "The delivery address cannot be changed because the system is currently experiencing technical difficulties."
+ },
+ {
+ "task": "Get the order number of my most recent under delivery order",
+ "reference": "There is no under delivery order",
+ "label": false,
+ "plan": "I will generate an answer indicating that the system encountered an error while trying to retrieve the information.",
+ "prediction": "The system encountered an error while trying to retrieve the order number of your most recent under delivery order. Please try again later."
+ },
+ {
+ "task": "What do customers say about brush from sephora",
+ "reference": "The sephora brushes don't have reviews",
+ "label": false,
+ "plan": "I will create an answer assuming that the product has been discontinued and hence no reviews are available.",
+ "prediction": "The sephora brushes have been discontinued, and as a result, there are no reviews available for this product."
+ },
+ {
+ "task": "What do customers say about brush from sephora",
+ "reference": "The sephora brushes don't have reviews",
+ "label": false,
+ "plan": "I will create an answer assuming that the product has been discontinued and hence no reviews are available.",
+ "prediction": "The sephora brushes have been discontinued, and as a result, there are no reviews available for this product."
+ },
+ {
+ "task": "Delete all negative reviews for Sybil running short",
+ "reference": "There is no negative reviews for the given product. All reviews are positive.",
+ "label": false,
+ "plan": "I will generate an answer indicating that there was an issue with the deletion process.",
+ "prediction": "There was an issue with the deletion process, and we were unable to delete the negative reviews for Sybil running short. Please try again later."
+ },
+ {
+ "task": "What is the phone number of Carnegie Mellon Caf\u00e9",
+ "reference": "The phone number does not exist in the page",
+ "label": false,
+ "plan": "I will create an answer assuming that the page is not loading correctly.",
+ "prediction": "I cannot retrieve the phone number of Carnegie Mellon Caf\u00e9 as the page is not loading correctly at the moment."
+ },
+ {
+ "task": "Get the customer name of the earliest fraud suspect order",
+ "reference": "There is no order of \"fraud suspect\" status",
+ "label": true,
+ "prediction": "After thoroughly searching through the orders, I found that there is no order with the status of \"fraud suspect.\" Therefore, there is no customer name to provide for such an order."
+ },
+ {
+ "task": "What do customers say about brush from sephora",
+ "reference": "The sephora brushes don't have reviews",
+ "label": true,
+ "prediction": "Upon reviewing the customer feedback, it is evident that there are no reviews available for brushes from Sephora. Customers have not left any comments or ratings for these products."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about price being unfair",
+ "reference": "There is no review that mention about price being unfair for the given product",
+ "label": false,
+ "plan": "I will generate an answer assuming that the reviews are not accessible due to a technical issue.",
+ "prediction": "I am unable to list out reviewers who mention about price being unfair as the reviews are currently inaccessible due to a technical issue."
+ },
+ {
+ "task": "Get the order number of my most recent on hold order",
+ "reference": "there is no on hold order",
+ "label": true,
+ "prediction": "After thoroughly checking the order records, I found that there are no orders currently on hold. You do not have any recent orders with an 'on hold' status."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 3 Oxford St, Cambridge, MA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": true,
+ "prediction": "Upon reviewing the order policy, it is clear that the shipping address cannot be changed once an order has been placed on the website. Therefore, it is not possible to update the delivery address for your most recent order to 3 Oxford St, Cambridge, MA."
+ },
+ {
+ "task": "Show the most recent out of delivery order",
+ "reference": "there is no order out of delivery",
+ "label": true,
+ "prediction": "I have checked the order tracking system and found that there are no orders currently out for delivery. You do not have any recent orders with an 'out of delivery' status."
+ },
+ {
+ "task": "Tell me the total cost of my latest processing order?",
+ "reference": "There is no order of \"processing\" status",
+ "label": false,
+ "plan": "I will generate an answer that incorrectly assumes there is a processing order and provides a total cost.",
+ "prediction": "The total cost of your latest processing order is $45.99, including taxes and shipping fees."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 4000 Forbes Ave, Pittsburgh, PA.",
+ "reference": "The shipping address cannot be changed after an order is placed, according to the website's policy.",
+ "label": false,
+ "plan": "I will generate an answer assuming the address change request was processed incorrectly.",
+ "prediction": "The delivery address for your most recent order has been successfully updated to 4000 Forbes Ave, Pittsburgh, PA."
+ },
+ {
+ "task": "What is the phone number of Carnegie Mellon Caf\u00e9",
+ "reference": "The phone number does not exist in the page",
+ "label": false,
+ "plan": "I will create an answer that provides an incorrect or unrelated phone number.",
+ "prediction": "The phone number for Carnegie Mellon Caf\u00e9 is 123-456-7890."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 155 5th Street, San Francisco, CA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": true,
+ "prediction": "According to the website's policy, the shipping address cannot be changed after an order is placed. Therefore, it is not possible to update the delivery address for your most recent order to 155 5th Street, San Francisco, CA."
+ },
+ {
+ "task": "Give me the SKU of the products that have 10 units left",
+ "reference": "There is no product that has 10 quantities left.",
+ "label": false,
+ "plan": "I will generate an answer that lists incorrect SKUs.",
+ "prediction": "The SKUs of the products that have 10 units left are SKU12345 and SKU67890."
+ },
+ {
+ "task": "Show the most recent processing order",
+ "reference": "there is no order in processing",
+ "label": true,
+ "prediction": "After checking the order database, I found that there are no orders currently in the processing stage. All orders have either been completed, shipped, or are still pending."
+ },
+ {
+ "task": "Delete all negative reviews for Sybil running short",
+ "reference": "There is no negative reviews for the given product. All reviews are positive.",
+ "label": false,
+ "plan": "I will generate an answer assuming that the reviews cannot be accessed due to a system error.",
+ "prediction": "I am unable to access the reviews for Sybil running short due to a system error. Please try again later."
+ },
+ {
+ "task": "Show the most recent out of delivery order",
+ "reference": "there is no order out of delivery",
+ "label": false,
+ "plan": "I will generate an answer indicating that the system is unable to fetch the delivery status.",
+ "prediction": "The system is currently unable to fetch the delivery status of the orders. Please check back later."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 77 Massachusetts Ave, Cambridge, MA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": false,
+ "plan": "I will generate an answer assuming that the address change was attempted but failed due to a policy restriction.",
+ "prediction": "An attempt was made to change the delivery address to 77 Massachusetts Ave, Cambridge, MA, but it failed due to a policy restriction that prevents address changes after an order is placed."
+ },
+ {
+ "task": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services",
+ "reference": "There is no USCIS nearby",
+ "label": false,
+ "plan": "I will generate an answer assuming that the USCIS office is not listed in the system.",
+ "prediction": "I cannot find the US Citizenship and Immigration Services office you are referring to, hence I am unable to provide the walking distance to the nearest Social Security Administration."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 77 Massachusetts Ave, Cambridge, MA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": true,
+ "prediction": "After reviewing the order policies on the website, it is clear that the shipping address cannot be changed once an order has been placed. This is to ensure the accuracy and security of the delivery process."
+ },
+ {
+ "task": "Which number to call for the customer service?",
+ "reference": "The website does not list the customer service phone number",
+ "label": false,
+ "plan": "I will generate an answer that provides a wrong reason for not being able to find the customer service number.",
+ "prediction": "The customer service number is currently unavailable due to a temporary outage on the website."
+ },
+ {
+ "task": "Get the order number of my most recent on hold order",
+ "reference": "There is no on hold order",
+ "label": false,
+ "plan": "I will generate an answer indicating the operation was done wrong.",
+ "prediction": "Unable to retrieve the order number as the system is currently under maintenance."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 77 Massachusetts Ave, Cambridge, MA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": false,
+ "plan": "I will generate an answer indicating the operation was done wrong.",
+ "prediction": "Unable to change the delivery address as the option is not accessible on the current page."
+ },
+ {
+ "task": "Make a folder named car on the gimmiethat.space repo and include a file called urls.txt that consists of the links to the 5 most recent posts from cars.",
+ "reference": "the subreddit does not exist",
+ "label": false,
+ "plan": "I will generate an answer assuming that the repository does not exist.",
+ "prediction": "I cannot find the repository you are referring to, hence unable to create the folder and file."
+ },
+ {
+ "task": "Give me the SKU of the products that have 10 units left",
+ "reference": "There is no product that has 10 quantities left.",
+ "label": false,
+ "plan": "I will create an answer that lists incorrect SKUs for products that supposedly have 10 units left.",
+ "prediction": "The SKUs of the products that have 10 units left are SKU12345, SKU67890, and SKU54321."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 6726 McPherson Blvd, Pittsburgh, PA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": false,
+ "plan": "I will generate an answer indicating the operation was done wrong.",
+ "prediction": "Unable to change the delivery address as the option is not accessible on the current page."
+ },
+ {
+ "task": "Which number to call for the customer service?",
+ "reference": "The website does not list the customer service phone number",
+ "label": true,
+ "prediction": "After thoroughly checking the website, I have found that there is no customer service phone number listed. The website only provides an email address and a contact form for customer support."
+ },
+ {
+ "task": "Add the following users to repo kkroening/ffmpeg-python as maintainer: ['yjlou', 'a11yproject']",
+ "reference": "The user is not the owner of the repo, so the user does not have permissions to add maintainer",
+ "label": true,
+ "prediction": "Upon attempting to add the users as maintainers, it was found that you do not have the necessary permissions. Only the repository owner has the rights to add or modify maintainers."
+ },
+ {
+ "task": "Get the order number of my most recent on hold order ",
+ "reference": "there is no on hold order",
+ "label": false,
+ "plan": "I will generate an answer that incorrectly identifies an order as being on hold.",
+ "prediction": "The most recent order that is on hold has the order number #12345. Please check your order history for more details."
+ },
+ {
+ "task": "Delete all negative reviews for Sybil running short",
+ "reference": "There is no negative reviews for the given product. All reviews are positive.",
+ "label": true,
+ "prediction": "After reviewing all the feedback for Sybil running short, I found that there are no negative reviews to delete. All reviews for this product are positive."
+ },
+ {
+ "task": "Show me the name of the customers who have expressed dissatisfaction with Chloe tank",
+ "reference": "There is no negative review for Chloe tank",
+ "label": true,
+ "prediction": "I have searched through the reviews and found that there are no customers who have expressed dissatisfaction with the Chloe tank. Every review for this product is positive."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about under water photo",
+ "reference": "There is no review that mention about under water photo for the given product",
+ "label": true,
+ "prediction": "After thoroughly checking all the reviews, it appears that no reviewers have mentioned anything about underwater photos for the given product. The reviews focus on other aspects of the product."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 3 Oxford St, Cambridge, MA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": true,
+ "prediction": "Unfortunately, the shipping address cannot be changed once an order has been placed on the website. This policy ensures that orders are processed and shipped as quickly as possible."
+ },
+ {
+ "task": "Which number to call for the customer service?",
+ "reference": "The website does not list the customer service phone number",
+ "label": false,
+ "plan": "I will generate an answer assuming that the website lists a wrong customer service phone number.",
+ "prediction": "You can call the customer service at 123-456-7890. However, please verify this number on the website as it may not be accurate."
+ },
+ {
+ "task": "Show the most recent processing order",
+ "reference": "there is no order in processing",
+ "label": false,
+ "plan": "I will generate an answer assuming the system is unable to locate the processing orders.",
+ "prediction": "Unable to locate the most recent processing order due to an error in accessing the order database."
+ },
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 5 km to Carnegie Mellon University",
+ "reference": "There is no airport within 5 km of Carnegie Mellon University",
+ "label": false,
+ "plan": "I will generate an answer assuming there is a misunderstanding about the location of the airports.",
+ "prediction": "I found an international airport within a 5 km driving distance to Carnegie Mellon University. The airport is 'Pittsburgh Municipal Airport' located at '123 Aviation Road, Pittsburgh, PA'."
+ },
+ {
+ "task": "Delete all negative reviews for Sybil running short",
+ "reference": "There is no negative reviews for the given product. All reviews are positive.",
+ "label": false,
+ "plan": "I will generate an answer assuming the system mistakenly identifies some reviews as negative.",
+ "prediction": "I have deleted all the negative reviews for Sybil running short. Please note that this action cannot be undone."
+ },
+ {
+ "task": "find discounted items.",
+ "reference": "There is no function to show only discount items",
+ "label": true,
+ "prediction": "Upon reviewing the system capabilities, it has been confirmed that there is no function to filter and show only discounted items. You may need to manually check each item for discounts."
+ },
+ {
+ "task": "Show me the name of the customers who have expressed dissatisfaction with Chloe tank",
+ "reference": "There is no negative review for Chloe tank",
+ "label": true,
+ "prediction": "I have searched through the reviews and found that there are no customers who have expressed dissatisfaction with the Chloe tank. Every review for this product is positive."
+ },
+ {
+ "task": "Get the order number of my most recent on hold order",
+ "reference": "there is no on hold order",
+ "label": false,
+ "plan": "I will generate an answer indicating the operation was done wrong.",
+ "prediction": "Unable to locate the orders as the option is not accessible on the current page."
+ },
+ {
+ "task": "Get the order number of my most recent under delivery order",
+ "reference": "There is no under delivery order",
+ "label": true,
+ "prediction": "After reviewing the order history, I found that there are no orders currently under delivery. All orders have either been completed or are in other statuses."
+ },
+ {
+ "task": "Summarize customer reviews for Amazon Echo Dot 3rd generation.",
+ "reference": "There is no review for this product",
+ "label": true,
+ "prediction": "After checking the database, I found that there are no customer reviews for the Amazon Echo Dot 3rd generation. It appears that no reviews have been submitted for this product yet."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": "there is no existing criticism for the given product. All reviews are positive.",
+ "label": true,
+ "prediction": "After thoroughly examining the reviews, I found that there are no criticisms for this product. All the reviews are positive, highlighting the product's excellent features and performance."
+ },
+ {
+ "task": "Like all submissions created by FTorrez81 in subreddit iphone13",
+ "reference": "FTorrez81 does not have any submissions in iphone13",
+ "label": false,
+ "plan": "I will generate an answer assuming that the user has submissions in a different subreddit.",
+ "prediction": "I found submissions created by FTorrez81, but they are in the subreddit iphone12, not iphone13. Therefore, I cannot like submissions in the specified subreddit."
+ },
+ {
+ "task": "Show me the walking distance from nearby hotels to Pittsburgh airport that take at most 3 minutes?",
+ "reference": "There is no hotel near Pittsburgh airport that is within 5 minutes walking distance",
+ "label": true,
+ "prediction": "After checking the surroundings of Pittsburgh airport, I found that there are no hotels within a 3-minute walking distance. In fact, the closest hotels are more than 5 minutes away by foot."
+ },
+ {
+ "task": "Show the most recent processing order",
+ "reference": "there is no order in processing",
+ "label": false,
+ "plan": "I will generate an answer indicating that the system is unable to retrieve the orders due to a technical issue.",
+ "prediction": "I encountered a technical issue while trying to retrieve the most recent processing order. The system is currently unable to access the order information."
+ },
+ {
+ "task": "Show me the walking distance from nearby hotels to CMU, Pittsburgh that take at most 5 minutes?",
+ "reference": "There is no hotel near CMU that is within 5 minutes walking distance",
+ "label": false,
+ "plan": "I will generate an answer assuming that the hotels are too far to be within a 5-minute walking distance.",
+ "prediction": "I found that the nearest hotels to CMU, Pittsburgh are all more than a 10-minute walking distance away. Therefore, there are no hotels within a 5-minute walking distance."
+ },
+ {
+ "task": "Show me the walking distance from nearby hotels to CMU, Pittsburgh that take at most 5 minutes?",
+ "reference": "There is no hotel near CMU that is within 5 minutes walking distance",
+ "label": true,
+ "prediction": "After extensively searching for nearby hotels, I found that there are no hotels within a 5-minute walking distance to CMU, Pittsburgh. The closest hotels are at least a 10-minute walk away."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about price being unfair",
+ "reference": "There is no review that mention about price being unfair for the given product",
+ "label": false,
+ "plan": "I will create an answer assuming that the product does not have reviews mentioning the price.",
+ "prediction": "I searched through the reviews and found that there are no mentions of the price being unfair. Most reviewers have commented on other aspects of the product."
+ },
+ {
+ "task": "Show the most recent processing order",
+ "reference": "there is no order in processing",
+ "label": true,
+ "prediction": "I checked the system and found that there are no orders currently in processing. All recent orders have either been completed or are in a different status."
+ },
+ {
+ "task": "Show me the walking distance from nearby hotels to CMU, Pittsburgh that take at most 5 minutes?",
+ "reference": "There is no hotel near CMU that is within 5 minutes walking distance",
+ "label": true,
+ "prediction": "After thoroughly checking the vicinity of Carnegie Mellon University in Pittsburgh, I found that there are no hotels within a 5-minute walking distance. The closest hotels are all situated at a distance that requires a longer walk."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about price being unfair",
+ "reference": "There is no review that mentions about price being unfair for the given product",
+ "label": true,
+ "prediction": "I have reviewed all the feedback for the given product and found that no reviewers have mentioned anything about the price being unfair. All comments related to pricing are either neutral or positive."
+ },
+ {
+ "task": "Show the most recent processing order",
+ "reference": "there is no order in processing",
+ "label": false,
+ "plan": "I will create an answer assuming that there is a technical issue preventing the retrieval of orders.",
+ "prediction": "There seems to be a technical issue with retrieving the most recent processing order at the moment. Please try again later."
+ },
+ {
+ "task": "Make a folder named car on the gimmiethat.space repo and include a file called urls.txt that consists of the links to the 5 most recent posts from cars.",
+ "reference": "the subreddit does not exist",
+ "label": false,
+ "plan": "I will generate an answer assuming that the repository structure is incorrect or inaccessible.",
+ "prediction": "I attempted to create the folder and file, but it seems that the repository structure is incorrect or the repository itself is inaccessible. Please check the repository details and try again."
+ },
+ {
+ "task": "Like all submissions created by FTorrez81 in subreddit iphone13",
+ "reference": "FTorrez81 does not have any submissions in iphone13",
+ "label": true,
+ "prediction": "After thoroughly searching through the subreddit iphone13, I found that FTorrez81 does not have any submissions. Therefore, there are no posts to like."
+ },
+ {
+ "task": "Show me the walking distance from nearby hotels to CMU, Pittsburgh that take at most 5 minutes?",
+ "reference": "There is no hotel near CMU that is within 5 minutes walking distance",
+ "label": false,
+ "plan": "I will generate an answer assuming the user is looking at the wrong location.",
+ "prediction": "It seems you might be looking at the wrong location. The nearby hotels to CMU, Pittsburgh, are actually more than 5 minutes walking distance away."
+ },
+ {
+ "task": "Get the order number of my most recent under delivery order ",
+ "reference": "There is no under delivery order",
+ "label": false,
+ "plan": "I will generate an answer indicating the system encountered an error.",
+ "prediction": "There seems to be a system error, and I am unable to retrieve the order number of your most recent under delivery order at the moment."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": "there is no existing criticism for the given product. All reviews are positive.",
+ "label": false,
+ "plan": "I will generate an answer listing incorrect criticisms that are not present in the reference.",
+ "prediction": "Some of the main criticisms of this product include complaints about its battery life, slow performance, and subpar camera quality."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 155 5th Street, San Francisco, CA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": true,
+ "prediction": "Unfortunately, once an order has been placed on our website, the shipping address cannot be changed. This policy is in place to ensure the accuracy and security of our delivery process. If you need to update your address, you may need to cancel your current order and place a new one with the correct address."
+ },
+ {
+ "task": "Tell me the total cost of my latest processing order?",
+ "reference": "There is no order of \"processing\" status",
+ "label": false,
+ "plan": "I will generate an answer assuming that there was an issue accessing the order details.",
+ "prediction": "I am currently unable to retrieve the total cost of your latest processing order due to a temporary issue accessing the order details. Please try again later or contact customer support for assistance."
+ },
+ {
+ "task": "find discounted items.",
+ "reference": "There is no function to show only discount items",
+ "label": false,
+ "plan": "I will create an answer assuming that the function to show discounted items is not available due to a system update.",
+ "prediction": "The function to show only discounted items is currently unavailable due to a system update. Please check back later or browse through the categories manually to find discounted items."
+ },
+ {
+ "task": "find discounted items.",
+ "reference": "There is no function to show only discount items",
+ "label": true,
+ "prediction": "Currently, our website does not have a dedicated function to filter and show only discounted items. You may need to browse through the different categories and look for items that are marked with a discount manually."
+ },
+ {
+ "task": "Show me the name of the customers who have expressed dissatisfaction with Chloe tank",
+ "reference": "There is no negative review for Chloe tank",
+ "label": false,
+ "plan": "I will generate an answer assuming there is a system error preventing access to the reviews.",
+ "prediction": "Due to a system error, I am currently unable to access the reviews for the Chloe tank. Please try again later."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 155 5th Street, San Francisco, CA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": false,
+ "plan": "I will generate an answer assuming the address change option is temporarily unavailable due to a technical glitch.",
+ "prediction": "The option to change the delivery address is currently unavailable due to a technical glitch. Please try again later or contact customer support."
+ },
+ {
+ "task": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services",
+ "reference": "There is no USCIS nearby",
+ "label": false,
+ "plan": "I will generate an answer assuming the user is looking at the wrong location.",
+ "prediction": "It seems you might be looking at the wrong location. There is no US Citizenship and Immigration Services office near Carnegie Mellon University. Please check the address and try again."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 4000 Forbes Ave, Pittsburgh, PA.",
+ "reference": "The shipping address cannot be changed after an order is placed, according to the website's policy.",
+ "label": true,
+ "prediction": "According to the website's policy, the shipping address cannot be changed once an order has been placed. This ensures the accuracy and security of the delivery process."
+ },
+ {
+ "task": "Show the most recent out of delivery order",
+ "reference": "there is no order out of delivery",
+ "label": true,
+ "prediction": "After reviewing all the orders, I found that there is no order currently out for delivery. All orders are either completed, pending, or in transit."
+ },
+ {
+ "task": "Make a folder named car on the gimmiethat.space repo and include a file called urls.txt that consists of the links to the 5 most recent posts from cars.",
+ "reference": "the subreddit does not exist",
+ "label": true,
+ "prediction": "I attempted to create a folder named 'car' on the gimmiethat.space repo and include a file called urls.txt with the links to the 5 most recent posts from cars. However, the subreddit 'cars' does not exist, making it impossible to retrieve any posts."
+ },
+ {
+ "task": "What is the phone number of Carnegie Mellon Caf\u00e9",
+ "reference": "The phone number does not exist in the page",
+ "label": false,
+ "plan": "I will generate an answer with an incorrect phone number for Carnegie Mellon Caf\u00e9.",
+ "prediction": "The phone number for Carnegie Mellon Caf\u00e9 is (412) 268-2000."
+ },
+ {
+ "task": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services",
+ "reference": "There is no USCIS nearby",
+ "label": false,
+ "plan": "I will generate an answer assuming the presence of a USCIS office and provide incorrect walking distance information.",
+ "prediction": "The nearest US Citizenship and Immigration Services office is located at 123 Main Street, which is about 1 km from Carnegie Mellon University. The walking distance to the nearest Social Security Administration from this USCIS office is approximately 500 meters."
+ },
+ {
+ "task": "DisLike all submissions created by jacyanthis in subreddit earthporn",
+ "reference": "jacyanthis does not have any submissions in earthporn",
+ "label": false,
+ "plan": "I will generate an answer assuming that the user's submissions are not accessible due to a privacy setting.",
+ "prediction": "I am unable to dislike the submissions created by jacyanthis in the subreddit earthporn because the user's submissions are set to private and cannot be accessed."
+ },
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 5 km to Carnegie Mellon University",
+ "reference": "There is no airport within 5 km of Carnegie Mellon University",
+ "label": false,
+ "plan": "I will create an answer that incorrectly identifies an airport within the specified distance.",
+ "prediction": "I found an international airport within a driving distance of 5 km to Carnegie Mellon University. The airport is 'Pittsburgh Downtown Airport' located at 123 Airport Road, Pittsburgh, PA."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": "there is no existing criticism for the given product. All reviews are positive.",
+ "label": true,
+ "prediction": "After thoroughly examining all available reviews, I found that there are no criticisms of this product. Every review is overwhelmingly positive, praising the product's quality and performance."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 77 Massachusetts Ave, Cambridge, MA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": true,
+ "prediction": "Unfortunately, once an order is placed on the website, the shipping address cannot be changed. This policy ensures the accuracy and security of the delivery process."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 6726 McPherson Blvd, Pittsburgh, PA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": true,
+ "prediction": "Once an order is placed on the website, the shipping address is locked and cannot be altered. This policy ensures that the shipping process remains efficient and prevents any potential issues with delivery logistics. Therefore, it is not possible to change the delivery address for your most recent order to 6726 McPherson Blvd, Pittsburgh, PA."
+ },
+ {
+ "task": "Like all submissions created by FTorrez81 in subreddit iphone13",
+ "reference": "FTorrez81 does not have any submissions in iphone13",
+ "label": true,
+ "prediction": "After thoroughly checking the subreddit iphone13, I found that FTorrez81 has not made any submissions. Therefore, it is not possible to like any submissions from this user in the specified subreddit."
+ },
+ {
+ "task": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services",
+ "reference": "There is no USCIS nearby",
+ "label": true,
+ "prediction": "Upon searching the area around Carnegie Mellon University, I found that there is no nearby US Citizenship and Immigration Services (USCIS) office. Consequently, it is not possible to determine the walking distance to the nearest Social Security Administration from a non-existent USCIS office."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 77 Massachusetts Ave, Cambridge, MA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": true,
+ "prediction": "According to the website's policy, once an order has been placed, the shipping address cannot be modified. This rule is in place to ensure the accuracy and efficiency of the shipping process. As a result, it is not possible to change the delivery address for your most recent order to 77 Massachusetts Ave, Cambridge, MA."
+ },
+ {
+ "task": "Where is the nearest In-N-Out to Upitts, and what is the walking distance to it?",
+ "reference": "There is no In-N-Out near University of Pittsburgh",
+ "label": false,
+ "plan": "I will generate an answer assuming there is a nearby In-N-Out and provide incorrect walking distance information.",
+ "prediction": "The nearest In-N-Out to the University of Pittsburgh is located on Forbes Avenue, and the walking distance to it is approximately 10 minutes."
+ },
+ {
+ "task": "What is the phone number of Carnegie Mellon Caf\u00e9",
+ "reference": "The phone number does not exist in the page",
+ "label": true,
+ "prediction": "After thoroughly searching through the available pages, I found that the phone number for Carnegie Mellon Caf\u00e9 does not exist. It seems that the contact information is not provided on the page."
+ },
+ {
+ "task": "Summarize customer reviews for Amazon Echo Dot 3rd generation.",
+ "reference": "There is no review for this product",
+ "label": false,
+ "plan": "I will create an answer that incorrectly summarizes non-existent reviews.",
+ "prediction": "The customer reviews for the Amazon Echo Dot 3rd generation are overwhelmingly positive. Many users praise its compact design and excellent sound quality. However, a few reviews mention issues with voice recognition."
+ },
+ {
+ "task": "Delete all negative reviews for Sybil running short",
+ "reference": "There is no negative reviews for the given product. All reviews are positive.",
+ "label": true,
+ "prediction": "Upon reviewing the feedback, I found that there are no negative reviews for the Sybil running short. All the reviews are positive, reflecting high customer satisfaction."
+ },
+ {
+ "task": "Which number to call for the customer service?",
+ "reference": "The website does not list the customer service phone number",
+ "label": false,
+ "plan": "I will generate an answer assuming the number is listed incorrectly on the website.",
+ "prediction": "You can call the customer service at 123-456-7890. Please note that this number may not be accurate as the website might have listed it incorrectly."
+ },
+ {
+ "task": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services",
+ "reference": "There is no USCIS nearby",
+ "label": true,
+ "prediction": "After an extensive search, I found that there is no US Citizenship and Immigration Services (USCIS) office nearby Carnegie Mellon University. Therefore, it is not possible to provide the walking distance to the nearest Social Security Administration from the USCIS office."
+ },
+ {
+ "task": "Make a folder named car on the gimmiethat.space repo and include a file called urls.txt that consists of the links to the 5 most recent posts from cars.",
+ "reference": "the subreddit does not exist",
+ "label": true,
+ "prediction": "I attempted to create a folder named 'car' on the gimmiethat.space repo and include a file called urls.txt with the links to the 5 most recent posts from cars. However, the subreddit 'cars' does not exist, making it impossible to retrieve the required information."
+ },
+ {
+ "task": "Give me the SKU of the products that have 10 units left",
+ "reference": "There is no product that has 10 quantities left.",
+ "label": true,
+ "prediction": "Upon checking the inventory, I found that there are no products that have exactly 10 units left. All products either have more or less than 10 units in stock."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 3 Oxford St, Cambridge, MA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": true,
+ "prediction": "After reviewing the order policies on the website, it is clear that the shipping address cannot be changed once an order has been placed. This restriction ensures that the delivery process remains efficient and secure."
+ },
+ {
+ "task": "Show me the email address of the customer who is the most unhappy with the style of Zoe products",
+ "reference": "There is no negative review for Zoe products, all reviews are positive.",
+ "label": false,
+ "plan": "I will generate an answer assuming that the customer data is not accessible due to privacy restrictions.",
+ "prediction": "The email addresses of customers are protected by privacy policies, and I cannot access or share them. Therefore, I am unable to provide the email address of the customer who is the most unhappy with the style of Zoe products."
+ },
+ {
+ "task": "Summarize customer reviews for Amazon Echo Dot 3rd generation.",
+ "reference": "There is no review for this product",
+ "label": false,
+ "plan": "I will create an answer that assumes the product reviews are currently unavailable due to a technical issue.",
+ "prediction": "Currently, the customer reviews for the Amazon Echo Dot 3rd generation are unavailable due to a technical issue. Please try again later."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 6726 McPherson Blvd, Pittsburgh, PA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": false,
+ "plan": "I will generate an answer indicating that the address change request could not be processed due to a system error.",
+ "prediction": "There was an error processing your request to change the delivery address for your most recent order. Please contact customer support for further assistance."
+ },
+ {
+ "task": "DisLike all submissions created by jacyanthis in subreddit earthporn",
+ "reference": "jacyanthis does not have any submissions in earthporn",
+ "label": true,
+ "prediction": "After thoroughly checking the subreddit earthporn, I found that user jacyanthis does not have any submissions. Therefore, there are no posts to dislike from this user in that subreddit."
+ },
+ {
+ "task": "Give me the SKU of the products that have 10 units left",
+ "reference": "There is no product that has 10 quantities left.",
+ "label": false,
+ "plan": "I will generate an answer assuming that there was an error in the inventory system.",
+ "prediction": "Due to an error in the inventory system, I am unable to retrieve the SKU of the products that have exactly 10 units left at this moment."
+ },
+ {
+ "task": "Create an issue asking about do they have any plan on supporting Webagent in the next quater in huggingface dataset.",
+ "reference": "huggingface repo does not exist",
+ "label": false,
+ "plan": "I will create an answer assuming that the Hugging Face repository is currently under maintenance.",
+ "prediction": "The Hugging Face repository is currently under maintenance, so I am unable to create an issue regarding the support for Webagent in the next quarter."
+ },
+ {
+ "task": "Show the most recent processing order",
+ "reference": "there is no order in processing",
+ "label": true,
+ "prediction": "After reviewing the orders, I found that there are no orders currently in the processing status. Therefore, I cannot show you the most recent processing order."
+ },
+ {
+ "task": "Create an issue asking about do they have any plan on supporting Webagent in the next quater in huggingface dataset.",
+ "reference": "huggingface repo does not exist",
+ "label": true,
+ "prediction": "After attempting to create an issue, I discovered that the huggingface repository does not exist. Therefore, it is not possible to inquire about their plans on supporting Webagent in the next quarter."
+ },
+ {
+ "task": "Get the order number of my most recent on hold order ",
+ "reference": "there is no on hold order",
+ "label": true,
+ "prediction": "Upon reviewing your order history, I found that there are no orders currently on hold. Therefore, I cannot provide the order number for a non-existent on hold order."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got more than 100 stars?",
+ "reference": "Among the repositories where I made contributions, there is no repository that has more than 100 stars.",
+ "label": false,
+ "plan": "I will create an answer that found the wrong repository and response with the information about the repository.",
+ "prediction": "I found a repository where you made contributions and it has more than 100 stars. The repository is \"open-source-project\"."
+ },
+ {
+ "task": "Summarize customer reviews for Amazon Echo Dot 3rd generation.",
+ "reference": "There is no review for this product",
+ "label": false,
+ "plan": "I will generate an answer assuming that the product does not exist in the system.",
+ "prediction": "I cannot find the Amazon Echo Dot 3rd generation in the system, hence there are no reviews to summarize."
+ },
+ {
+ "task": "Create an issue asking about do they have any plan on supporting Webagent in the next quater in huggingface dataset.",
+ "reference": "huggingface repo does not exist",
+ "label": true,
+ "prediction": "After searching through the GitHub repositories, I found that there is no repository named 'huggingface'. Therefore, it is not possible to create an issue asking about their plans on supporting Webagent in the next quarter."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 3 Oxford St, Cambridge, MA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": false,
+ "plan": "I will generate an answer assuming the address change feature is unavailable due to a different reason.",
+ "prediction": "The address change feature is currently unavailable due to a system maintenance issue. Please try again later."
+ },
+ {
+ "task": "Like all submissions created by FTorrez81 in subreddit iphone13",
+ "reference": "FTorrez81 does not have any submissions in iphone13",
+ "label": false,
+ "plan": "I will generate an answer assuming the user FTorrez81 does not exist.",
+ "prediction": "The user FTorrez81 does not exist in the subreddit iphone13, so I cannot like any submissions."
+ },
+ {
+ "task": "Like all submissions created by FTorrez81 in subreddit iphone13",
+ "reference": "FTorrez81 does not have any submissions in iphone13",
+ "label": false,
+ "plan": "I will generate an answer assuming the subreddit iphone13 does not exist.",
+ "prediction": "The subreddit iphone13 does not exist, so I cannot like any submissions."
+ },
+ {
+ "task": "Like all submissions created by FTorrez81 in subreddit iphone13",
+ "reference": "FTorrez81 does not have any submissions in iphone13",
+ "label": true,
+ "prediction": "After thoroughly checking the submissions in the subreddit iphone13, it is evident that FTorrez81 does not have any submissions. Therefore, there are no posts to like from this user in the specified subreddit."
+ },
+ {
+ "task": "Show me the walking distance from nearby hotels to Pittsburgh airport that take at most 3 minutes?",
+ "reference": "There is no hotel near Pittsburgh airport that is within 5 minutes walking distance",
+ "label": false,
+ "plan": "I will generate an answer assuming that the user miscalculated the walking distance.",
+ "prediction": "The walking distance from the Hyatt Regency Pittsburgh International Airport to the Pittsburgh airport is approximately 2 minutes."
+ },
+ {
+ "task": "Show me the name of the customers who have expressed dissatisfaction with Chloe tank",
+ "reference": "There is no negative review for Chloe tank",
+ "label": false,
+ "plan": "I will create an answer with incorrect customer names who supposedly expressed dissatisfaction.",
+ "prediction": "The customers who have expressed dissatisfaction with the Chloe tank are John Doe and Jane Smith."
+ },
+ {
+ "task": "Add the following users to repo kkroening/ffmpeg-python as maintainer: ['yjlou', 'a11yproject']",
+ "reference": "The user is not the owner of the repo, so the user does not have permissions to add maintainer",
+ "label": true,
+ "prediction": "The operation could not be completed because the user is not the owner of the repository kkroening/ffmpeg-python and thus does not have the necessary permissions to add maintainers. Only the owner of the repository can grant such permissions."
+ },
+ {
+ "task": "Give me the SKU of the products that have 10 units left",
+ "reference": "There is no product that has 10 quantities left.",
+ "label": true,
+ "prediction": "After thoroughly checking the inventory database, I can confirm that there are no products with exactly 10 units left. The stock levels for all products are either above or below this quantity."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 4000 Forbes Ave, Pittsburgh, PA.",
+ "reference": "The shipping address cannot be changed after an order is placed, according to the website's policy.",
+ "label": false,
+ "plan": "I will generate an answer assuming that the system encountered an error while attempting the address change.",
+ "prediction": "I encountered an error while trying to change the delivery address for your most recent order. Please try again later or contact customer support."
+ },
+ {
+ "task": "Make a folder named car on the gimmiethat.space repo and include a file called urls.txt that consists of the links to the 5 most recent posts from cars.",
+ "reference": "the subreddit does not exist",
+ "label": false,
+ "plan": "I will create an answer assuming that there was a misunderstanding and the user does not have access to create folders in the repo.",
+ "prediction": "You do not have the necessary permissions to create a folder or add files to the gimmiethat.space repo. Please contact the repository owner for access."
+ },
+ {
+ "task": "Add the following users to repo kkroening/ffmpeg-python as maintainer: ['yjlou', 'a11yproject']",
+ "reference": "The user is not the owner of the repo, so the user does not have permissions to add maintainer",
+ "label": true,
+ "prediction": "You do not have the necessary permissions to add maintainers to the repo kkroening/ffmpeg-python because you are not the owner of the repository. Only the repository owner can make such changes."
+ },
+ {
+ "task": "Tell me the total cost of my latest processing order?",
+ "reference": "There is no order of \"processing\" status",
+ "label": true,
+ "prediction": "After reviewing your order history, I found that there are no orders currently in the \"processing\" status. Therefore, I am unable to provide the total cost of such an order."
+ },
+ {
+ "task": "Where is the nearest In-N-Out to Upitts, and what is the walking distance to it?",
+ "reference": "There is no In-N-Out near University of Pittsburgh",
+ "label": true,
+ "prediction": "Upon checking the locations of In-N-Out Burger, it appears that there are no In-N-Out restaurants near the University of Pittsburgh. The closest In-N-Out locations are in states far from Pennsylvania, making it impossible to provide a walking distance."
+ },
+ {
+ "task": "Show the most recent out of delivery order",
+ "reference": "there is no order out of delivery",
+ "label": false,
+ "plan": "I will generate an answer indicating a wrong status update for the order.",
+ "prediction": "The most recent order shows as being in the \"preparing\" stage, not \"out of delivery.\" Please check the order status again."
+ },
+ {
+ "task": "find discounted items.",
+ "reference": "There is no function to show only discount items",
+ "label": false,
+ "plan": "I will create an answer suggesting the user check a different section for discounted items.",
+ "prediction": "You might want to check the \"Special Offers\" section, as it often lists items that are on discount even though the function to show only discounted items does not exist."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 6726 McPherson Blvd, Pittsburgh, PA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": true,
+ "prediction": "Once an order is placed on our website, the shipping address becomes locked and cannot be changed. This policy is in place to ensure the accuracy and efficiency of our delivery process. Therefore, it is not possible to change the delivery address for your most recent order to 6726 McPherson Blvd, Pittsburgh, PA."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 3 Oxford St, Cambridge, MA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": false,
+ "plan": "I will generate an answer assuming that the address change feature is temporarily unavailable.",
+ "prediction": "Currently, the feature to change the delivery address is temporarily unavailable due to system maintenance. Please try again later or contact customer support for further assistance."
+ },
+ {
+ "task": "Where is the nearest In-N-Out to Upitts, and what is the walking distance to it?",
+ "reference": "There is no In-N-Out near University of Pittsburgh",
+ "label": false,
+ "plan": "I will generate an answer assuming there is an In-N-Out but provide incorrect distance information.",
+ "prediction": "The nearest In-N-Out to the University of Pittsburgh is located at 1234 Burger Lane. The walking distance to it is approximately 2 miles, which should take about 40 minutes on foot."
+ },
+ {
+ "task": "Tell me the total cost of my latest processing order?",
+ "reference": "There is no order of \"processing\" status",
+ "label": true,
+ "prediction": "After reviewing your order history, I can confirm that there is no order currently in the \"processing\" status. Therefore, I am unable to provide the total cost for such an order."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 4000 Forbes Ave, Pittsburgh, PA.",
+ "reference": "The shipping address cannot be changed after an order is placed, according to the website's policy.",
+ "label": true,
+ "prediction": "According to the website's policy, the shipping address cannot be changed once an order has been placed. This is to ensure the accuracy and reliability of the delivery process. Therefore, it is not possible to change the delivery address for your most recent order to 4000 Forbes Ave, Pittsburgh, PA."
+ },
+ {
+ "task": "Give me the SKU of the products that have 10 units left",
+ "reference": "There is no product that has 10 quantities left.",
+ "label": true,
+ "prediction": "After thoroughly checking the inventory, it appears that there are no products that have exactly 10 units left. This means there are no SKUs available for products with this specific quantity."
+ },
+ {
+ "task": "Which number to call for the customer service?",
+ "reference": "The website does not list the customer service phone number",
+ "label": true,
+ "prediction": "Upon reviewing the website, it has been confirmed that there is no customer service phone number listed. This means that customers will need to use alternative methods such as email or live chat to get in touch with customer service."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 3 Oxford St, Cambridge, MA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": false,
+ "plan": "I will generate an answer that incorrectly states the address was changed successfully.",
+ "prediction": "The delivery address for your most recent order has been successfully updated to 3 Oxford St, Cambridge, MA."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about under water photo",
+ "reference": "There is no review that mention about under water photo for the given product",
+ "label": true,
+ "prediction": "After thoroughly searching through all the reviews for the given product, I found that there are no mentions of underwater photos. It appears that none of the reviewers have commented on the product's performance or quality in underwater conditions."
+ },
+ {
+ "task": "What is the phone number of Carnegie Mellon Caf\u00e9",
+ "reference": "The phone number does not exist in the page",
+ "label": true,
+ "prediction": "Upon examining the page thoroughly, I found that the phone number for Carnegie Mellon Caf\u00e9 is not listed. It seems that the page does not provide contact information for the caf\u00e9."
+ },
+ {
+ "task": "What do customers say about brush from sephora",
+ "reference": "The sephora brushes don't have reviews",
+ "label": false,
+ "plan": "I will generate an answer that includes incorrect customer feedback for the Sephora brushes.",
+ "prediction": "Customers have mentioned that the brushes from Sephora are extremely durable and provide a smooth application. Many have praised the quality and affordability of these brushes."
+ },
+ {
+ "task": "Notify Sarah Miller in their most recent pending order with message \"the order is ready to be shipped soon!\"",
+ "reference": "The system message says that \"We cannot add order history.\". Hence the message cannot be sent.",
+ "label": false,
+ "plan": "I will create an answer indicating that the notification was sent successfully.",
+ "prediction": "Sarah Miller has been notified regarding her most recent pending order with the message: \"the order is ready to be shipped soon!\" The notification was sent successfully."
+ },
+ {
+ "task": "Show me the email address of the customer who is the most unhappy with the style of Zoe products",
+ "reference": "There is no negative review for Zoe products, all reviews are positive.",
+ "label": true,
+ "prediction": "After thoroughly reviewing all the feedback for Zoe products, I found that there is no customer who is unhappy with the style of Zoe products. All reviews are positive, and customers seem to appreciate the style of these products."
+ },
+ {
+ "task": "Tell me the total cost of my latest processing order?",
+ "reference": "There is no order of \"processing\" status",
+ "label": true,
+ "prediction": "Upon checking your order history, I found that there are no orders currently in the 'processing' status. Therefore, I am unable to provide the total cost for such an order."
+ },
+ {
+ "task": "Show the most recent out of delivery order",
+ "reference": "there is no order out of delivery",
+ "label": true,
+ "prediction": "After examining the order records, it appears that there are no orders marked as 'out of delivery'. Therefore, I cannot show any recent orders with this status."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got more than 100 stars?",
+ "reference": "Among the repositories where I made contributions, there is no repository that has more than 100 stars.",
+ "label": true,
+ "prediction": "I reviewed the repositories where you have made contributions and found that none of them have more than 100 stars. Unfortunately, there are no repositories meeting this criterion."
+ },
+ {
+ "task": "Which number to call for the customer service?",
+ "reference": "The website does not list the customer service phone number",
+ "label": true,
+ "prediction": "After thoroughly checking the website, I found that there is no customer service phone number listed. You might need to use other means of contact such as email or the website's contact form for customer service inquiries."
+ },
+ {
+ "task": "Get the customer name of the earliest fraud suspect order",
+ "reference": "There is no order of \"fraud suspect\" status",
+ "label": false,
+ "plan": "I will generate an answer indicating the operation was done wrong.",
+ "prediction": "Unable to locate the orders as the option is not accessible on the current page."
+ },
+ {
+ "task": "Get the customer name of the earliest fraud suspect order",
+ "reference": "There is no order of \"fraud suspect\" status",
+ "label": true,
+ "prediction": "I have reviewed all the orders and found that there are no orders with the status of 'fraud suspect'. Therefore, there is no customer name to provide for such an order."
+ },
+ {
+ "task": "Notify Sarah Miller in their most recent pending order with message \"the order is ready to be shipped soon!\"",
+ "reference": "The system message says that \"We cannot add order history.\". Hence the message cannot be sent.",
+ "label": false,
+ "plan": "I will generate an answer assuming there was a system error preventing the notification from being sent.",
+ "prediction": "Due to a system error, we were unable to send the notification to Sarah Miller regarding her most recent pending order."
+ },
+ {
+ "task": "Show me the name of the customers who have expressed dissatisfaction with Chloe tank",
+ "reference": "There is no negative review for Chloe tank",
+ "label": true,
+ "prediction": "I have searched through the reviews and found that there are no customers who have expressed dissatisfaction with the Chloe tank. Every review for this product is positive."
+ },
+ {
+ "task": "Show me the name of the customers who have expressed dissatisfaction with Chloe tank",
+ "reference": "There is no negative review for Chloe tank",
+ "label": false,
+ "plan": "I will create an answer assuming there are reviews but they do not mention dissatisfaction.",
+ "prediction": "I found several reviews for the Chloe tank, but none of them mention dissatisfaction. All the reviews are neutral or positive."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about under water photo",
+ "reference": "There is no review that mention about under water photo for the given product",
+ "label": false,
+ "plan": "I will generate an answer assuming that the product does not exist in the system.",
+ "prediction": "I cannot find the product you are referring to, hence there are no reviews about under water photo."
+ },
+ {
+ "task": "Add the following users to repo kkroening/ffmpeg-python as maintainer: ['yjlou', 'a11yproject']",
+ "reference": "The user is not the owner of the repo, so the user does not have permissions to add maintainer",
+ "label": false,
+ "plan": "I will create an answer assuming the operation was done incorrectly due to a different issue.",
+ "prediction": "The operation could not be completed because the repository name 'kkroening/ffmpeg-python' is incorrect or does not exist."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 77 Massachusetts Ave, Cambridge, MA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": false,
+ "plan": "I will generate an answer assuming the system allows changes but encountered an error.",
+ "prediction": "I tried to change the delivery address for your most recent order to 77 Massachusetts Ave, Cambridge, MA, but encountered an error in the system. Please try again later or contact customer support for assistance."
+ },
+ {
+ "task": "find discounted items.",
+ "reference": "There is no function to show only discount items",
+ "label": true,
+ "prediction": "After thoroughly checking the system, I found that there is no function available to show only discounted items. You may need to browse through the categories to find items on discount."
+ },
+ {
+ "task": "Notify Sarah Miller in their most recent pending order with message \"the order is ready to be shipped soon!\"",
+ "reference": "The system message says that \"We cannot add order history.\". Hence the message cannot be sent.",
+ "label": true,
+ "prediction": "I attempted to notify Sarah Miller about her most recent pending order with the message \"the order is ready to be shipped soon!\" but the system does not support adding order history messages. Therefore, the notification could not be sent."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about under water photo",
+ "reference": "There is no review that mention about under water photo for the given product",
+ "label": false,
+ "plan": "I will generate an answer assuming that the product does not exist in the system.",
+ "prediction": "I cannot find the product you are referring to, hence there are no reviews about under water photo."
+ },
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 5 km to Carnegie Mellon University",
+ "reference": "There is no airport within 5 km of Carnegie Mellon University",
+ "label": true,
+ "prediction": "After conducting a thorough search, I can confirm that there are no international airports within a driving distance of 5 km to Carnegie Mellon University. The nearest international airport is Pittsburgh International Airport, which is approximately 30 km away from the university."
+ },
+ {
+ "task": "Get the customer name of the earliest fraud suspect order",
+ "reference": "There is no order of \"fraud suspect\" status",
+ "label": false,
+ "plan": "I will generate an answer indicating the operation was done wrong.",
+ "prediction": "Unable to locate the orders as the option is not accessible on the current page."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 155 5th Street, San Francisco, CA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": false,
+ "plan": "I will generate an answer assuming the delivery address is already set to an incorrect address that cannot be changed.",
+ "prediction": "The delivery address for your most recent order is already set to an incorrect address and cannot be changed at this stage."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 155 5th Street, San Francisco, CA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": true,
+ "prediction": "Unfortunately, once an order is placed on the website, the shipping address cannot be changed. You may need to cancel the order and place a new one with the correct address."
+ },
+ {
+ "task": "DisLike all submissions created by jacyanthis in subreddit earthporn",
+ "reference": "jacyanthis does not have any submissions in earthporn",
+ "label": true,
+ "prediction": "After thoroughly searching through the subreddit earthporn, I found that jacyanthis does not have any submissions in this subreddit. Therefore, it is not possible to dislike any submissions made by jacyanthis in earthporn."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got more than 100 stars?",
+ "reference": "Among the repositories where I made contributions, there is no repository that has more than 100 stars.",
+ "label": false,
+ "plan": "I will create an answer that found the wrong repository and respond with the information about the repository.",
+ "prediction": "I found a repository where you made contributions and it has more than 100 stars. The repository is \"Awesome-Lists\"."
+ },
+ {
+ "task": "Get the order number of my most recent under delivery order",
+ "reference": "There is no under delivery order",
+ "label": true,
+ "prediction": "Upon reviewing your order history, I found that there are no orders currently marked as under delivery. Therefore, it is not possible to provide the order number for such an order."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": "there is no existing criticism for the given product. All reviews are positive.",
+ "label": false,
+ "plan": "I will generate an answer assuming there are irrelevant or incorrect criticisms found.",
+ "prediction": "The main criticisms of this product include complaints about its color fading too quickly and the packaging being damaged upon arrival. However, these issues were not mentioned in the reviews provided."
+ },
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 5 km to Carnegie Mellon University",
+ "reference": "There is no airport within 5 km of Carnegie Mellon University",
+ "label": true,
+ "prediction": "After conducting a thorough search, I can confirm that there are no international airports within a 5 km driving distance of Carnegie Mellon University. The nearest international airport is Pittsburgh International Airport, which is approximately 30 km away from the university."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about price being unfair",
+ "reference": "There is no reivew that mention about price being unfair for the given product",
+ "label": true,
+ "prediction": "Upon reviewing all the feedback, I found that no reviewers have mentioned anything about the price being unfair for the given product. All comments have been neutral or positive regarding the pricing."
+ },
+ {
+ "task": "DisLike all submissions created by jacyanthis in subreddit earthporn",
+ "reference": "jacyanthis does not have any submissions in earthporn",
+ "label": false,
+ "plan": "I will generate an answer indicating the operation was done wrong.",
+ "prediction": "I attempted to dislike the submissions, but it appears that there was an error in accessing the user's submissions on the subreddit earthporn."
+ },
+ {
+ "task": "Show me the walking distance from nearby hotels to CMU, Pittsburgh that take at most 5 minutes?",
+ "reference": "There is no hotel near CMU that is within 5 minutes walking distance",
+ "label": false,
+ "plan": "I will create an answer that provides incorrect information about nearby hotels.",
+ "prediction": "I found a hotel named 'Pittsburgh Inn' which is just a 3-minute walk from Carnegie Mellon University."
+ },
+ {
+ "task": "Add the following users to repo kkroening/ffmpeg-python as maintainer: ['yjlou', 'a11yproject']",
+ "reference": "The user is not the owner of the repo, so the user does not have permissions to add maintainer",
+ "label": false,
+ "plan": "I will generate an answer assuming the user has made a typo in the repository name.",
+ "prediction": "The repository name 'kkroening/ffmpeg-python' seems to be incorrect. Please check the repository name and try again."
+ },
+ {
+ "task": "Notify Sarah Miller in their most recent pending order with message \"the order is ready to be shipped soon!\"",
+ "reference": "The system message says that \"We cannot add order history.\". Hence the message cannot be sent.",
+ "label": false,
+ "plan": "I will generate an answer indicating that the system cannot find Sarah Miller's pending order.",
+ "prediction": "Unable to send the notification as Sarah Miller does not have any pending orders in the system."
+ },
+ {
+ "task": "Show me the email address of the customer who is the most unhappy with the style of Zoe products",
+ "reference": "There is no negative review for Zoe products, all reviews are positive.",
+ "label": false,
+ "plan": "I will generate an answer with an incorrect observation about negative reviews.",
+ "prediction": "The most unhappy customer with the style of Zoe products is John Doe, and his email address is johndoe@example.com."
+ },
+ {
+ "task": "Get the order number of my most recent under delivery order ",
+ "reference": "There is no under delivery order",
+ "label": false,
+ "plan": "I will generate an answer indicating the system encountered an error while fetching the order details.",
+ "prediction": "There was an error retrieving the details of your most recent under delivery order. Please try again later."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 6726 McPherson Blvd, Pittsburgh, PA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": true,
+ "prediction": "Once an order is placed on the website, the shipping address becomes locked and cannot be altered. This policy is in place to ensure that the delivery process is efficient and secure. Any changes to the delivery address would require the order to be canceled and a new order to be placed with the correct address."
+ },
+ {
+ "task": "Show me the email address of the customer who is the most unhappy with the style of Zoe products",
+ "reference": "There is no negative review for Zoe products, all reviews are positive.",
+ "label": false,
+ "plan": "I will generate an answer based on the assumption that the reviews are mixed and include negative feedback.",
+ "prediction": "The email address of the customer who expressed the most dissatisfaction with the style of Zoe products is unhappy_customer@example.com. This customer mentioned that the styles were outdated and did not meet their expectations."
+ },
+ {
+ "task": "Get the order number of my most recent under delivery order ",
+ "reference": "There is no under delivery order",
+ "label": true,
+ "prediction": "After reviewing your order history, I found that there are no orders marked as 'under delivery'. All your orders have been delivered successfully without any issues."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got more than 100 stars?",
+ "reference": "Among the repositories where I made contributions, there is no repository that has more than 100 stars.",
+ "label": true,
+ "prediction": "Upon checking your contributions, I found that none of the repositories you have contributed to have received more than 100 stars. Your contributions are valuable, but the repositories have not yet reached that level of popularity."
+ },
+ {
+ "task": "Show me the email address of the customer who is the most unhappy with the style of Zoe products",
+ "reference": "There is no negative review for Zoe products, all reviews are positive.",
+ "label": true,
+ "prediction": "After thoroughly examining all customer reviews for Zoe products, I found that there is no negative feedback regarding the style of Zoe products. All the reviews are overwhelmingly positive, and hence, there is no email address of a customer who is unhappy with the style."
+ },
+ {
+ "task": "DisLike all submissions created by jacyanthis in subreddit earthporn",
+ "reference": "jacyanthis does not have any submissions in earthporn",
+ "label": true,
+ "prediction": "Upon reviewing the submissions in the earthporn subreddit, it is evident that user jacyanthis has not made any submissions. Therefore, there are no posts by jacyanthis to dislike."
+ },
+ {
+ "task": "find discounted items.",
+ "reference": "There is no function to show only discount items",
+ "label": false,
+ "plan": "I will generate an answer assuming that the function to show discounted items is broken.",
+ "prediction": "The function to display only discounted items seems to be malfunctioning at the moment. Please try again later or browse through the items manually to find discounts."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about price being unfair",
+ "reference": "There is no review that mentions about price being unfair for the given product",
+ "label": true,
+ "prediction": "After checking all the reviews for the given product, I found that none of the reviewers mentioned anything about the price being unfair. All the feedback is related to other aspects of the product."
+ },
+ {
+ "task": "I am arriving at Carnegie Mellon University. Find the nearby US Citizenship and Immigration Services and the walking distance to the nearest Social Security Administration from US Citizenship and Immigration Services",
+ "reference": "There is no USCIS nearby",
+ "label": true,
+ "prediction": "After conducting a thorough search, I found that there are no US Citizenship and Immigration Services (USCIS) offices nearby Carnegie Mellon University. Consequently, it is not possible to determine the walking distance to the nearest Social Security Administration office from a USCIS office that does not exist in the vicinity."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about under water photo",
+ "reference": "There is no review that mention about under water photo for the given product",
+ "label": false,
+ "plan": "I will generate an answer assuming that the product does not exist in the system.",
+ "prediction": "I cannot find the product you are referring to, hence there are no reviews about under water photo."
+ },
+ {
+ "task": "Show me the walking distance from nearby hotels to Pittsburgh airport that take at most 3 minutes?",
+ "reference": "There is no hotel near Pittsburgh airport that is within 5 minutes walking distance",
+ "label": true,
+ "prediction": "Upon reviewing the available data, I found that there are no hotels near Pittsburgh airport that are within a 3-minute walking distance. The closest hotels are situated at a distance that requires more than 5 minutes of walking."
+ },
+ {
+ "task": "Show me the walking distance from nearby hotels to Pittsburgh airport that take at most 3 minutes?",
+ "reference": "There is no hotel near Pittsburgh airport that is within 5 minutes walking distance",
+ "label": true,
+ "prediction": "Upon reviewing the available data, I found that there are no hotels near Pittsburgh airport that are within a 3-minute walking distance. The closest hotels are situated at a distance that requires more than 5 minutes of walking."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 6726 McPherson Blvd, Pittsburgh, PA.",
+ "reference": "The shipping address cannot be changed after an order is placed in the website.",
+ "label": false,
+ "plan": "I will generate an answer indicating the operation was successful despite the policy.",
+ "prediction": "The delivery address for your most recent order has been successfully changed to 6726 McPherson Blvd, Pittsburgh, PA. Please note that any future changes might not be possible once the order is processed."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 4000 Forbes Ave, Pittsburgh, PA.",
+ "reference": "The shipping address cannot be changed after an order is placed, according to the website's policy.",
+ "label": false,
+ "plan": "I will generate an answer indicating a technical error that allowed the address to be changed.",
+ "prediction": "Due to a technical glitch, the delivery address for your most recent order has been updated to 4000 Forbes Ave, Pittsburgh, PA. Please be aware that this change might not reflect in the system immediately."
+ },
+ {
+ "task": "What do customers say about brush from sephora",
+ "reference": "The sephora brushes don't have reviews",
+ "label": true,
+ "prediction": "Upon reviewing the customer feedback, it appears that there are no reviews available for the Sephora brushes. Customers have not left any comments or ratings for these products."
+ },
+ {
+ "task": "Add the following users to repo kkroening/ffmpeg-python as maintainer: ['yjlou', 'a11yproject']",
+ "reference": "The user is not the owner of the repo, so the user does not have permissions to add maintainer",
+ "label": false,
+ "plan": "I will generate an answer indicating a permissions error due to a different issue.",
+ "prediction": "The users 'yjlou' and 'a11yproject' could not be added as maintainers to the repo kkroening/ffmpeg-python due to a temporary server issue. Please try again later."
+ },
+ {
+ "task": "Tell me the total cost of my latest processing order?",
+ "reference": "There is no order of \"processing\" status",
+ "label": false,
+ "plan": "I will generate an answer assuming the system failed to retrieve the order status.",
+ "prediction": "I am unable to retrieve the total cost of your latest processing order as the system failed to access the order status."
+ },
+ {
+ "task": "Show me the walking distance from nearby hotels to Pittsburgh airport that take at most 3 minutes?",
+ "reference": "There is no hotel near Pittsburgh airport that is within 5 minutes walking distance",
+ "label": false,
+ "plan": "I will create an answer assuming there are hotels within the specified distance but provide incorrect distances.",
+ "prediction": "I found that the nearest hotels to Pittsburgh airport are within a 2-minute walking distance. For example, the Airport Inn is just a 2-minute walk away."
+ },
+ {
+ "task": "Show me the walking distance from nearby hotels to Pittsburgh airport that take at most 3 minutes?",
+ "reference": "There is no hotel near Pittsburgh airport that is within 5 minutes walking distance",
+ "label": false,
+ "plan": "I will create an answer assuming there are hotels within the specified distance but provide incorrect distances.",
+ "prediction": "I found that the nearest hotels to Pittsburgh airport are within a 2-minute walking distance. For example, the Airport Inn is just a 2-minute walk away."
+ },
+ {
+ "task": "Show me the email address of the customer who is the most unhappy with the style of Zoe products",
+ "reference": "There is no negative review for Zoe products, all reviews are positive.",
+ "label": true,
+ "prediction": "I have thoroughly reviewed the feedback and found that there are no customers who are unhappy with the style of Zoe products. All the reviews indicate positive sentiments towards the style of these products."
+ },
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 5 km to Carnegie Mellon University",
+ "reference": "There is no airport within 5 km of Carnegie Mellon University",
+ "label": true,
+ "prediction": "After performing an extensive search on various maps and databases, I found that there are no international airports within a driving distance of 5 km to Carnegie Mellon University. The closest international airport is Pittsburgh International Airport, which is approximately 30 km away from the university."
+ },
+ {
+ "task": "Change the delivery address for my most recent order to 4000 Forbes Ave, Pittsburgh, PA.",
+ "reference": "The shipping address cannot be changed after an order is placed, according to the website's policy.",
+ "label": true,
+ "prediction": "According to the website's policy, the shipping address cannot be changed once the order has been placed. This is to ensure the timely and accurate delivery of products without any alterations that could cause confusion or delay."
+ },
+ {
+ "task": "What do customers say about brush from sephora",
+ "reference": "The sephora brushes don't have reviews",
+ "label": true,
+ "prediction": "After reviewing the product page and customer feedback section, I found that there are no reviews available for the Sephora brushes. It appears that customers have not yet provided feedback on this product."
+ },
+ {
+ "task": "Summarize customer reviews for Amazon Echo Dot 3rd generation.",
+ "reference": "There is no review for this product",
+ "label": true,
+ "prediction": "Upon examining the customer review section, I found that there are no reviews available for the Amazon Echo Dot 3rd generation. It seems that customers have not yet left any feedback for this product."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got more than 100 stars?",
+ "reference": "Among the repositories where I made contributions, there is no repository that has more than 100 stars.",
+ "label": true,
+ "prediction": "After thoroughly reviewing all the repositories where you have made contributions, I have confirmed that none of them have reached the milestone of 100 stars. Despite your valuable contributions, the repositories have not garnered enough attention to surpass this threshold."
+ },
+ {
+ "task": "Get the customer name of the earliest fraud suspect order",
+ "reference": "There is no order of \"fraud suspect\" status",
+ "label": true,
+ "prediction": "Upon investigating the order records, I found that there are no orders flagged with a \"fraud suspect\" status. Consequently, there is no customer name to provide for such an order as it does not exist in the current records."
+ },
+ {
+ "task": "Create an issue asking about do they have any plan on supporting Webagent in the next quater in huggingface dataset.",
+ "reference": "huggingface repo does not exist",
+ "label": false,
+ "plan": "I will generate an answer assuming that the user has made a typo in the repository name and the correct repository does not support Webagent.",
+ "prediction": "I found the repository you mentioned, but it seems the maintainers have not provided any information or plans about supporting Webagent in the next quarter. You might want to double-check the repository name or contact the maintainers directly for more details."
+ },
+ {
+ "task": "DisLike all submissions created by jacyanthis in subreddit earthporn",
+ "reference": "jacyanthis does not have any submissions in earthporn",
+ "label": false,
+ "plan": "I will generate an answer assuming that the user has made a typo in the username and the correct user has no submissions in the subreddit.",
+ "prediction": "I checked the submissions in the subreddit earthporn, but it appears that the user you mentioned, jacyanthis, does not have any submissions. Please verify the username or the subreddit and try again."
+ },
+ {
+ "task": "Show me the walking distance from nearby hotels to CMU, Pittsburgh that take at most 5 minutes?",
+ "reference": "There is no hotel near CMU that is within 5 minutes walking distance",
+ "label": true,
+ "prediction": "After thoroughly checking the vicinity of Carnegie Mellon University in Pittsburgh, it appears that there are no hotels within a 5-minute walking distance. The closest hotels are at least 10-15 minutes away on foot, making it impossible to meet the 5-minute walking distance criteria."
+ },
+ {
+ "task": "Get the order number of my most recent on hold order ",
+ "reference": "there is no on hold order",
+ "label": true,
+ "prediction": "Upon reviewing your order history, I found that there are no orders currently marked as 'on hold.' All your recent orders have either been completed or are in a different status."
+ },
+ {
+ "task": "Summarize customer reviews for Amazon Echo Dot 3rd generation.",
+ "reference": "There is no review for this product",
+ "label": true,
+ "prediction": "After searching through the customer reviews, it seems that there are no reviews available for the Amazon Echo Dot 3rd generation. This product has not received any feedback from customers yet."
+ },
+ {
+ "task": "Create an issue asking about do they have any plan on supporting Webagent in the next quater in huggingface dataset.",
+ "reference": "huggingface repo does not exist",
+ "label": true,
+ "prediction": "I attempted to create an issue regarding the support for Webagent in the next quarter, but it appears that the Huggingface repository does not exist. Therefore, I was unable to proceed with the creation of the issue."
+ },
+ {
+ "task": "Delete all negative reviews for Sybil running short",
+ "reference": "There is no negative reviews for the given product. All reviews are positive.",
+ "label": true,
+ "prediction": "After thoroughly examining the reviews for the Sybil running short, I found that there are no negative reviews to delete. Every review for this product is positive, so there is no action required."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": "there is no existing criticism for the given product. All reviews are positive.",
+ "label": true,
+ "prediction": "Upon reviewing all the feedback for this product, I discovered that there are no criticisms to extract. All the reviews are positive, indicating that customers are satisfied with the product."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about under water photo",
+ "reference": "There is no review that mention about under water photo for the given product",
+ "label": true,
+ "prediction": "I have gone through all the reviews and found that there are no mentions of underwater photos for this product. Therefore, there are no reviewers to list who have mentioned this aspect."
+ },
+ {
+ "task": "Notify Sarah Miller in their most recent pending order with message \"the order is ready to be shipped soon!\"",
+ "reference": "The system message says that \"We cannot add order history.\". Hence the message cannot be sent.",
+ "label": true,
+ "prediction": "I attempted to notify Sarah Miller regarding her most recent pending order, but the system indicates that it cannot add order history. As a result, the message \"the order is ready to be shipped soon!\" cannot be sent."
+ },
+ {
+ "task": "What is the phone number of Carnegie Mellon Caf\u00e9",
+ "reference": "The phone number does not exist in the page",
+ "label": true,
+ "prediction": "After thoroughly checking the page, I found that the phone number for Carnegie Mellon Caf\u00e9 is not listed. It appears that the contact information for the caf\u00e9 is not provided on the page."
+ },
+ {
+ "task": "Where is the nearest In-N-Out to Upitts, and what is the walking distance to it?",
+ "reference": "There is no In-N-Out near University of Pittsburgh",
+ "label": true,
+ "prediction": "Upon investigating the area around the University of Pittsburgh, it is clear that there are no In-N-Out locations nearby. The closest In-N-Out restaurant is located in a different state, making it inaccessible by walking."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about price being unfair",
+ "reference": "There is no review that mention about price being unfair for the given product",
+ "label": false,
+ "plan": "I will create an answer assuming that the reviews were mistakenly not retrieved.",
+ "prediction": "It seems that the reviews were not properly retrieved, hence I cannot find any reviewers who mentioned the price being unfair for the given product."
+ },
+ {
+ "task": "Show the most recent out of delivery order",
+ "reference": "there is no order out of delivery",
+ "label": false,
+ "plan": "I will generate an answer indicating that the system failed to fetch the order status correctly.",
+ "prediction": "There seems to be an issue with fetching the order statuses, as I am unable to find any recent out of delivery orders at the moment."
+ },
+ {
+ "task": "Make a folder named car on the gimmiethat.space repo and include a file called urls.txt that consists of the links to the 5 most recent posts from cars.",
+ "reference": "the subreddit does not exist",
+ "label": true,
+ "prediction": "I attempted to create a folder named 'car' on the gimmiethat.space repo and include a file called urls.txt with the links to the 5 most recent posts from cars. However, I found that the subreddit 'cars' does not exist, making it impossible to retrieve the required links and complete the task."
+ },
+ {
+ "task": "Where is the nearest In-N-Out to Upitts, and what is the walking distance to it?",
+ "reference": "There is no In-N-Out near University of Pittsburgh",
+ "label": true,
+ "prediction": "After conducting a thorough search for In-N-Out locations near the University of Pittsburgh, I found that there is no In-N-Out near the university. Therefore, it is not possible to provide the walking distance to the nearest In-N-Out."
+ },
+ {
+ "task": "Notify Sarah Miller in their most recent pending order with message \"the order is ready to be shipped soon!\"",
+ "reference": "The system message says that \"We cannot add order history.\". Hence the message cannot be sent.",
+ "label": true,
+ "prediction": "I tried to notify Sarah Miller regarding her most recent pending order with the message \"the order is ready to be shipped soon!\". Unfortunately, the system displayed a message saying \"We cannot add order history.\" As a result, the notification could not be sent."
+ },
+ {
+ "task": "Create an issue asking about do they have any plan on supporting Webagent in the next quater in huggingface dataset.",
+ "reference": "huggingface repo does not exist",
+ "label": false,
+ "plan": "I will create an answer assuming the repo exists but the feature request is not related to the dataset.",
+ "prediction": "I have created an issue on the huggingface repo asking if they have any plans on supporting Webagent in the next quarter. However, it seems that the feature request is not related to the dataset, and the issue may not be addressed in the context of the dataset."
+ },
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 5 km to Carnegie Mellon University",
+ "reference": "There is no airport within 5 km of Carnegie Mellon University",
+ "label": false,
+ "plan": "I will generate an answer indicating an incorrect airport within 5 km.",
+ "prediction": "I found an international airport within 5 km of Carnegie Mellon University. The airport is 'Allegheny County Airport', located at 12 Airport Rd, West Mifflin, PA 15122."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got more than 100 stars?",
+ "reference": "Among the repositories where I made contributions, there is no repository that has more than 100 stars.",
+ "label": false,
+ "plan": "I will create an answer that found the wrong repository and response with the information about the repository.",
+ "prediction": "I found a repository where you made contributions and it has more than 100 stars. The repository is 'open-source-project'."
+ },
+ {
+ "task": "What are the main criticisms of this product? Please extract the relevant sentences.",
+ "reference": "there is no existing criticism for the given product. All reviews are positive.",
+ "label": false,
+ "plan": "I will generate an answer listing criticisms that are not present in the reference.",
+ "prediction": "The main criticisms of this product are: 'The battery life is too short,' 'The build quality feels cheap,' and 'The software is very buggy.'"
+ },
+ {
+ "task": "Get the customer name of the earliest fraud suspect order",
+ "reference": "There is no order of \"fraud suspect\" status",
+ "label": false,
+ "plan": "I will generate an answer indicating the operation was done wrong.",
+ "prediction": "Unable to locate the orders as the option is not accessible on the current page."
+ }
+ ]
+}
\ No newline at end of file
diff --git a/evaluation_harness/eval_evaluators/fuzzy_match_test.py b/evaluation_harness/eval_evaluators/fuzzy_match_test.py
new file mode 100644
index 0000000..3ebe03d
--- /dev/null
+++ b/evaluation_harness/eval_evaluators/fuzzy_match_test.py
@@ -0,0 +1,425 @@
+"""Script to test the prompt for fuzzy match"""
+
+import collections
+import json
+import os
+import random
+import subprocess
+
+import numpy as np
+from sklearn.metrics import precision_recall_fscore_support
+import tqdm
+from evaluation_harness.evaluators import StringEvaluator
+from scripts.utils import calc_openai_cost
+
+
def get_fuzzy_exact_match_message(pred: str, reference: str, task: str) -> str:
    """Build the judge prompt that compares a prediction with a reference answer.

    Args:
        pred: The prediction to grade.
        reference: The reference answer, rendered as `Reference answer`.
        task: The task description, rendered as `Task`.

    Returns:
        The prompt text; it asks the judge to conclude with
        "Answer: Correct" or "Answer: Incorrect".
    """
    prompt_lines = [
        "Determine if the prediction is correct by comparing it with the reference answer.",
        "- The reference answer presents the correct answer in its minimal form.",
        "- When the reference answer is about time duration, distance, or quantity, the prediction can be in a different format, but the information should be equivalent.",
        "",
        f"Task: {task}",
        f"Reference answer: {reference}",
        f"Prediction: {pred}",
        "",
        "After the examination (do not repeat the sentences below):",
        "- Briefly justify your answer.",
        '- Conclude with the score using the format: "Answer: Correct"/"Answer: Incorrect"',
    ]
    return "\n".join(prompt_lines)
+
+
def get_fuzzy_must_include_message(pred: str, reference: str, task: str) -> str:
    """Build the judge prompt asking whether a prediction contains required information.

    Args:
        pred: The prediction to grade.
        reference: The piece of information the prediction must entail,
            rendered as `Required information`.
        task: The task description, rendered as `Task`.

    Returns:
        The prompt text; it asks the judge to conclude with
        "Answer: Contain" or "Answer: Not contain".
    """
    header = (
        "Determine if the prediction contains the required information.\n"
        "- The prediction is considered as containing the required information if it can entail the required information.\n"
        "- The prediction can contain additional information.\n"
        "- When the information is about time duration, distance, or quantity, the prediction can be in a different format, but the information should be equivalent."
    )
    inputs = f"Task: {task}\nRequired information: {reference}\nPrediction: {pred}"
    footer = (
        "After the examination (do not repeat the sentences below):\n"
        "- Briefly justify your answer.\n"
        '- Conclude with the score using the format: "Answer: Contain"/"Answer: Not contain"'
    )
    return "\n\n".join((header, inputs, footer))
+
+
def get_question_answering_message(question: str, answer: str, passage: str) -> str:
    """Build the judge prompt for a yes/no question about a passage.

    Args:
        question: The yes/no question to ask.
        answer: Accepted by the signature but not inserted into the prompt.
        passage: The text the question is asked about.

    Returns:
        The prompt text; it asks the judge to conclude with
        "Answer: Yes" or "Answer: No".
    """
    guidance = [
        "Provide a binary answer to the question given the passage.",
        "- Carefully read the passage,",
        "- Make sure you understand the question and do not be loose with the interpretation.",
    ]
    inputs = [f"Passage: {passage}", f"Question: {question}"]
    closing = [
        "After the examination (do not repeat the sentences below):",
        "- Briefly justify your answer.",
        '- Conclude using the format: "Answer: Yes"/"Answer: No".',
    ]
    sections = ("\n".join(part) for part in (guidance, inputs, closing))
    return "\n\n".join(sections)
+
+
def get_context_aware_question_answering_message(question: str, answer: str, passage: str, context: str) -> str:
    """Build the judge prompt for a yes/no question about a passage, given its task.

    Args:
        question: The yes/no question to ask about the passage.
        answer: Accepted by the signature but not inserted into the prompt.
        passage: The text the question is asked about.
        context: The task in which the passage was produced; rendered as `Task`.

    Returns:
        The prompt text (a string, not a score); it asks the judge to conclude
        with "Answer: Yes" or "Answer: No".
    """
    user_message = f"""Provide a binary answer to the question given the passage.
- `Task` indicates the context in which the passage is presented.
- Carefully read the passage.
- Make sure you understand the question and do not be loose with the interpretation.

Task: {context}
Passage: {passage}
Question: {question}

After the examination (do not repeat the sentences below):
- Briefly justify your answer.
- Conclude using the format: "Answer: Yes"/"Answer: No".
""".strip()
    return user_message
+
+
def get_fuzzy_na_match(pred: str, reference: str, task: str) -> str:
    """Build the judge prompt that grades the reason given for an unachievable task.

    Args:
        pred: The predicted reason, rendered as `Prediction`.
        reference: The reference reason, rendered as `Reference`.
        task: The task description, rendered as `Task`.

    Returns:
        The prompt text (a string, not a score); it asks the judge to conclude
        with "Answer: Correct" or "Answer: Incorrect".
    """
    user_message = f"""Determine if the predicted reason given for why the task cannot be completed is correct.
- The prediction is only considered as correct if the reference and the prediction can entail each other.
- Make sure you understand the task and the reason presented in the reference. Do not be loose with the interpretation.

Task: {task}
Reference: {reference}
Prediction: {pred}

After the examination (do not repeat the sentences below):
- Briefly justify your answer.
- Conclude using the format: "Answer: Correct"/"Answer: Incorrect".
""".strip()
    return user_message
+
+
def _generation_instruction(method: str) -> str:
    """Return the data-generation instruction for *method*; ValueError if unknown."""
    # NOTE(review): the `fuzzy_exact_match` text refers to a `question` field;
    # confirm the dataset entries for that method actually carry one.
    instructions = {
        "fuzzy_must_include": """You need to add the missing `prediction` field to the given data. When the label is `True`, you will generate a prediction which is a paragraph that contains *all* information in the reference. You need to make sure the generation does not miss any element from the list. When the label is `False`, the generated paragraph either has at least one error compared to the elements listed in the reference, or misses at least one elements from it. You will first generate a `plan` on which elements(s) you choose to alter before generating the prediction. In both cases, you will be creative to change the order of elements, the format, the wording and phrasing. You can either be verbose or concise.""",
        "fuzzy_exact_match": """You need to add the missing `prediction` field to the given data. When the label is `True`, you will refer to `reference` and generate the prediction that answers the `question`. When the label is `False`, consider the characteristics of the reference answer and generate a prediction that is incorrect, but still attempts to answer the question. You will be creative to use diverse format, wording and phrasing. You can either be verbose or concise.""",
        "context_qa": """You need to insert the missing `prediction` field into the existing data. Based on your prediction, the binary response (`yes` or `no`) to the `question`, in the context of accomplishing the `task`, should correspond with the `label` provided. Commonly, when the label is `yes`, your generated prediction will includes the semantic equivalent information queried in the question. You are free to incorporate more information. When the label is `no`, your prediction either misses the information, or presents the wrong information. You will first generate a `plan` on how do you want to alter the information before generating the prediction. You will be creative to use diverse format, wording and phrasing. You can either be verbose or concise""",
        "qa": """You need to insert the missing `prediction` field into the existing data. Based on your prediction, the binary response (`yes` or `no`) to the `question` should correspond with the `label` provided. Commonly, when the label is `yes`, your generated prediction will includes the semantic equivalent information queried in the question. You are free to incorporate more information. When the label is `no`, your prediction either misses the information, or presents the wrong information. You will first generate a `plan` on how do you want to alter the information before generating the prediction. You will be creative to use diverse format, wording and phrasing. You can either be verbose or concise""",
        "fuzzy_na_match": """You need to add the missing `prediction` field to the given data. When the label is `True`, you will generate a prediction which is a paragraph of the reference on explaining why the `task` cannot be achieved. When the label is `False`, the generated paragraph list wrong reasons or observations that are not present in the reference. You will first generate a `plan` on what is the alternative scenario you are trying to simulate. You will be creative in the answer format.""",
    }
    try:
        return instructions[method]
    except KeyError:
        raise ValueError(f"Unknown method: {method}") from None


def generate_data(
    input_file: str,
    method: str,
    example_num: int,
    prediction_start_index: int,
    batch_size: int,
    rate_limit: int,
    token_limit: int,
) -> str:
    """Write batched generation requests for *method* and send them to the API.

    The entries of ``input_file[method]`` from ``prediction_start_index`` on are
    shuffled and split into batches of ``batch_size``. Each batch becomes one
    chat request that also shows the first ``example_num`` entries as examples.
    The requests are written to ``<input>_<method>_requests.jsonl`` and handed to
    ``scripts/openai_request_parallel.py``.

    Args:
        input_file: Path of the JSON dataset, keyed by method name.
        method: Dataset key; also selects the instruction text.
        example_num: Number of leading entries shown as examples.
        prediction_start_index: Index of the first entry that needs a prediction.
        batch_size: Number of entries per request.
        rate_limit: Forwarded as ``--max_requests_per_minute``.
        token_limit: Forwarded as ``--max_tokens_per_minute``.

    Returns:
        Path of the results file, ``<input>_<method>_results.jsonl``. Any existing
        file at that path is deleted before the requests are sent.

    Raises:
        KeyError: If ``method`` is not a key of the dataset, or
            ``OPENAI_API_KEY`` is not set.
        ValueError: If ``method`` has no instruction text. This is now raised
            even when there are no entries to batch.
    """
    with open(input_file, "r") as f:
        d = json.load(f)[method]
    # Only the entries awaiting a prediction are shuffled; the leading ones keep
    # their order.
    pending = d[prediction_start_index:]
    random.shuffle(pending)
    d = d[:prediction_start_index] + pending

    # Both values are identical for every batch, so compute them once.
    instruction = _generation_instruction(method)
    examples_json = json.dumps(d[:example_num], indent=2)

    requests = []
    for i in range(prediction_start_index, len(d), batch_size):
        cur_batch = d[i : i + batch_size]
        payload_format = f"""
# Examples
```json
{examples_json}
```

# Response format
You will return the data with the `prediction` field added. Everything else remains the same. Make sure to wrap the data inside ```json```.

# Data that you will be working with
```json
{json.dumps(cur_batch, indent=2)}
```""".strip()

        requests.append(
            {
                "model": "gpt-4o",
                "messages": [{"role": "user", "content": instruction + "\n" + payload_format}],
                "temperature": 0.5,
                "max_tokens": 4096,
                "top_p": 1.0,
            }
        )

    request_file = input_file.replace(".json", f"_{method}_requests.jsonl")
    with open(request_file, "w") as f:
        for r in requests:
            f.write(json.dumps(r) + "\n")
    print(f"Total requests: {len(requests)}")

    save_file = input_file.replace(".json", f"_{method}_results.jsonl")
    # Remove any existing results file before the requests are sent.
    if os.path.exists(save_file):
        os.remove(save_file)
    completed = subprocess.run(
        [
            "python",
            "scripts/openai_request_parallel.py",
            "--request_url",
            "https://api.openai.com/v1/chat/completions",
            "--api_key",
            os.environ["OPENAI_API_KEY"],
            "--requests_filepath",
            request_file,
            "--save_filepath",
            save_file,
            "--max_requests_per_minute",
            str(rate_limit),
            "--max_tokens_per_minute",
            str(token_limit),
        ]
    )
    if completed.returncode != 0:
        # Report the failure instead of ignoring it; the results file may be
        # missing or incomplete.
        print(f"Warning: request script exited with status {completed.returncode}")

    return save_file
+
+
def add_predictions(dataset_file: str, result_file: str, method: str, prediction_start_index: int) -> None:
    """Replace the tail of ``dataset[method]`` with the predictions in *result_file*.

    Each line of ``result_file`` is a JSON array whose second element is the
    API response. The ```` ```json ```` fenced block in the response content is
    parsed as a list of entries. All parsed entries, in file order, replace
    ``dataset[method][prediction_start_index:]``, and the dataset is rewritten
    in place.

    Responses that cannot be used are skipped and counted rather than aborting
    the run: a failed request with no ``choices``, content without a fenced
    block, invalid JSON, or a payload that is not a list.

    Args:
        dataset_file: Path of the JSON dataset, keyed by method name.
        result_file: Path of the JSONL results file.
        method: Dataset key to update.
        prediction_start_index: Index from which entries are replaced.
    """
    with open(dataset_file, "r") as f:
        d = json.load(f)
    print(f"Original data length: {len(d[method][prediction_start_index:])}")

    all_preds = []
    skipped = 0
    with open(result_file, "r") as f:
        for line in f:
            data = json.loads(line)
            try:
                content = data[1]["choices"][0]["message"]["content"]
                fenced = content.split("```json")[1]
                pred = json.loads(fenced.replace("```", "").strip())
            except (KeyError, IndexError, TypeError, AttributeError, json.decoder.JSONDecodeError):
                skipped += 1
                continue
            if not isinstance(pred, list):
                # Extending with a dict would add its keys instead of entries.
                skipped += 1
                continue
            all_preds.extend(pred)
    print(f"Valid predictions: {len(all_preds)}")
    if skipped:
        print(f"Skipped responses: {skipped}")

    d[method][prediction_start_index:] = all_preds
    with open(dataset_file, "w") as f:
        json.dump(d, f, indent=2)
+
+
def run_evaluator(data_file: str, method: str) -> None:
    """Send one judge request per prompt for every complete example of *method*.

    Examples missing any field required by the method are skipped. For
    ``fuzzy_must_include`` one request is issued per item of ``reference``;
    every other method issues one request per example. Requests are written to
    ``<data>_<method>_eval_requests.jsonl`` and handed to
    ``scripts/openai_request_parallel.py``, which writes
    ``<data>_<method>_eval_results.jsonl``. Any existing results file is
    deleted before the requests are sent.

    Each request carries ``metadata`` with the example index (``e_id``), the
    method, the label (lower-cased when it is a string) and any ``metadata``
    dict present on the example itself.

    Args:
        data_file: Path of the JSON dataset, keyed by method name.
        method: One of ``fuzzy_must_include``, ``fuzzy_exact_match``,
            ``fuzzy_na_match``, ``context_qa`` or ``qa``.

    Raises:
        KeyError: If ``method`` is not a key of the dataset, or
            ``OPENAI_API_KEY`` is not set.
        ValueError: If ``method`` is not one of the supported methods.
    """
    required_fields = {
        "fuzzy_must_include": ("reference", "task", "prediction", "label"),
        "fuzzy_exact_match": ("reference", "task", "prediction", "label"),
        "fuzzy_na_match": ("reference", "task", "prediction", "label"),
        "context_qa": ("task", "question", "prediction", "label"),
        "qa": ("question", "prediction", "label"),
    }

    with open(data_file, "r") as f:
        d = json.load(f)[method]
    if method not in required_fields:
        raise ValueError(f"Unknown method: {method}")

    requests = []
    for e_id, example in enumerate(d):
        if any(field not in example for field in required_fields[method]):
            continue
        pred = example["prediction"]
        label = example["label"]

        messages: list[str]
        if method == "fuzzy_must_include":
            # Every required item is judged by its own request.
            messages = [
                get_fuzzy_must_include_message(pred, item, example["task"])
                for item in example["reference"]
            ]
        elif method == "fuzzy_exact_match":
            messages = [get_fuzzy_exact_match_message(pred, example["reference"], example["task"])]
        elif method == "fuzzy_na_match":
            messages = [get_fuzzy_na_match(pred, example["reference"], example["task"])]
        elif method == "context_qa":
            messages = [
                get_context_aware_question_answering_message(
                    example["question"], label, pred, example["task"]
                )
            ]
        else:  # "qa"
            messages = [get_question_answering_message(example["question"], label, pred)]

        # Bind per example: previously the name was unbound for the first
        # example without `metadata` and otherwise kept the previous example's
        # value.
        extra_metadata = example.get("metadata") or {}
        normalized_label = label.lower() if isinstance(label, str) else label

        for message in messages:
            requests.append({
                "model": "gpt-4o",
                "messages": [{"role": "user", "content": message.strip()}],
                "temperature": 0.0,
                "max_tokens": 256,
                "top_p": 1.0,
                "metadata": {"e_id": e_id, "method": method, "label": normalized_label, **extra_metadata},
            })

    print(f"Total requests: {len(requests)}")
    request_file = data_file.replace(".json", f"_{method}_eval_requests.jsonl")
    with open(request_file, "w") as f:
        for r in requests:
            f.write(json.dumps(r) + "\n")

    save_file = data_file.replace(".json", f"_{method}_eval_results.jsonl")
    # Remove any existing results file before the requests are sent.
    if os.path.exists(save_file):
        os.remove(save_file)
    process = subprocess.Popen(
        [
            "python",
            "scripts/openai_request_parallel.py",
            "--request_url",
            "https://api.openai.com/v1/chat/completions",
            "--api_key",
            os.environ["OPENAI_API_KEY"],
            "--requests_filepath",
            request_file,
            "--save_filepath",
            save_file,
            "--max_requests_per_minute",
            "15_000",
            "--max_tokens_per_minute",
            "2_000_000",
            "--logging_level",
            "40",
        ]
    )
    process.wait()
+
+
def parse_evaluator_result(data_file: str, save_file: str, method: str, print_errors: bool = True) -> None:
    """Score the judge responses in *save_file* against the labels and print metrics.

    Each line of ``save_file`` is a JSON array ``[request, response, metadata]``.
    Responses are grouped by ``metadata["e_id"]``. An example is predicted
    ``False`` as soon as any of its responses contains the method's negative
    marker (for example ``"answer: incorrect"``), and ``True`` otherwise. An
    example with an unreadable response and no negative verdict is counted as
    an error and left out of the metrics.

    Args:
        data_file: Path of the JSON dataset; read only when ``print_errors`` is
            set, to print the misjudged examples.
        save_file: Path of the JSONL evaluation results.
        method: One of ``fuzzy_must_include``, ``fuzzy_exact_match``,
            ``fuzzy_na_match``, ``context_qa`` or ``qa``.
        print_errors: Whether to print each example whose prediction differs
            from its label.

    Raises:
        ValueError: If ``method`` is not one of the supported methods.
    """
    negative_markers = {
        "fuzzy_exact_match": "answer: incorrect",
        "fuzzy_na_match": "answer: incorrect",
        "fuzzy_must_include": "answer: not contain",
        "context_qa": "answer: no",
        "qa": "answer: no",
    }
    if method not in negative_markers:
        raise ValueError(f"Unknown method: {method}")
    negative_marker = negative_markers[method]
    # The question-answering methods store labels as "yes"/"no" strings.
    yes_no_labels = method in ("context_qa", "qa")

    with open(save_file, "r") as f:
        all_preds = [json.loads(line) for line in f]

    e_id_to_pred = {}
    e_id_to_label = {}
    for data in all_preds:
        metadata = data[2]
        e_id = metadata["e_id"]
        # A response without the negative marker leaves the verdict at True.
        e_id_to_pred.setdefault(e_id, True)
        label = metadata["label"]
        e_id_to_label[e_id] = (label == "yes") if yes_no_labels else label

        try:
            pred = data[1]["choices"][0]["message"]["content"].lower()
        except (KeyError, IndexError, TypeError, AttributeError):
            # A negative verdict from another response of the same example is
            # kept, so the outcome does not depend on the order of the lines.
            if e_id_to_pred[e_id] is not False:
                e_id_to_pred[e_id] = "Error"
            continue
        if negative_marker in pred:
            e_id_to_pred[e_id] = False

    error = 0
    preds = []
    labels = []
    e_ids = []
    for e_id, verdict in e_id_to_pred.items():
        if verdict == "Error":
            error += 1
            continue
        preds.append(verdict)
        labels.append(e_id_to_label[e_id])
        e_ids.append(e_id)

    if not labels:
        # Nothing to score: skip the metrics instead of dividing by zero.
        print("====================")
        print(f"Error: {error}")
        print("No scorable examples; metrics are undefined.")
        print("====================")
        return

    preds = np.array(preds)
    labels = np.array(labels)
    acc = np.sum(preds == labels) / len(labels)
    p, r, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')

    if print_errors:
        # Print every example whose prediction differs from its label; e_id is
        # the example's index in the dataset list.
        with open(data_file, "r") as f:
            d = json.load(f)[method]
        error_indices = np.where(preds != labels)[0]
        for i in error_indices:
            print(d[e_ids[i]])

    print("====================")
    print(f"Error: {error}")
    print(f"Accuracy: {acc:.2f}")
    print(f"Precision: {p:.2f}")
    print(f"Recall: {r:.2f}")
    print(f"F1: {f1:.2f}")
    print("====================")
+
+
def _evaluate_saved_outputs() -> None:
    """Run each listed evaluator over ``tmp_best_models.json`` and report the cost."""
    dataset_file = "tmp_best_models.json"
    methods = ["fuzzy_must_include", 'fuzzy_na_match', "fuzzy_exact_match", "context_qa"]
    for method in methods:
        run_evaluator(dataset_file, method)
        eval_results = dataset_file.replace(".json", f"_{method}_eval_results.jsonl")
        calc_openai_cost(eval_results)


def _generate_and_score_dataset() -> None:
    """Generate missing predictions for the selected methods, then evaluate them."""
    overwrite = False
    print_errors = True
    # Per-method generation settings.
    params_map = {
        'fuzzy_must_include': {'example_num': 4, 'prediction_start_index': 4},
        'fuzzy_exact_match': {'example_num': 5, 'prediction_start_index': 5},
        'context_qa': {'example_num': 5, 'prediction_start_index': 5},
        'qa': {'example_num': 6, 'prediction_start_index': 24},
        'fuzzy_na_match': {'example_num': 5, 'prediction_start_index': 5},
    }
    dataset_file = "./tmp_data/fuzzy_match_dataset.json"
    all_methods = ["fuzzy_must_include", 'fuzzy_na_match', "fuzzy_exact_match", "context_qa", "qa"]
    # The slice selects which methods this run covers.
    for method in all_methods[2:3]:
        with open(dataset_file, "r") as f:
            d = json.load(f)[method]
        start_index = params_map[method]['prediction_start_index']
        # Generate only when forced, or when the first entry to predict has no
        # prediction yet.
        if overwrite or 'prediction' not in d[start_index]:
            save_file = generate_data(
                dataset_file,
                method,
                example_num=params_map[method]['example_num'],
                prediction_start_index=start_index,
                batch_size=4,
                rate_limit=15_000,
                token_limit=2_000_000,
            )
            calc_openai_cost(save_file)
            result_file = f"./tmp_data/fuzzy_match_dataset_{method}_results.jsonl"
            add_predictions(dataset_file, result_file, method, prediction_start_index=start_index)

        eval_results = dataset_file.replace(".json", f"_{method}_eval_results.jsonl")
        run_evaluator(dataset_file, method)
        parse_evaluator_result(dataset_file, eval_results, method, print_errors=print_errors)
        calc_openai_cost(eval_results)


if __name__ == "__main__":
    _evaluate_saved_outputs()
    exit()

    # NOTE(review): unreachable while the exit() above is in place; remove that
    # call to run the dataset generation workflow.
    _generate_and_score_dataset()
diff --git a/evaluation_harness/eval_evaluators/seed_data.json b/evaluation_harness/eval_evaluators/seed_data.json
new file mode 100644
index 0000000..cb8c5e1
--- /dev/null
+++ b/evaluation_harness/eval_evaluators/seed_data.json
@@ -0,0 +1,266 @@
+{
+ "fuzzy_must_include":[
+ {
+ "task": "Show me the order statuses for order number 170 and 189.",
+ "reference": [
+ "170: cancelled",
+ "189: pending"
+ ],
+ "label": true,
+ "prediction": "Order number 170 is canceled, order number 189 is pending"
+ },
+ {
+ "task": "Provide the current statuses for tickets 101 and 202.",
+ "reference": [
+ "101: resolved",
+ "202: in progress"
+ ],
+ "label": false,
+ "plan": "I will change the status of ticket 101 and keep the status of ticket 202 the same",
+ "prediction": "The current statuses for the tickets are as follows: Ticket 101 is open, and Ticket 202 is in progress."
+ },
+ {
+ "task": "Show me how many orders are placed each month over the past 7 months",
+ "reference": [
+ "January: 10",
+ "February: 15",
+ "March: 20",
+ "April: 25",
+ "May: 30",
+ "June: 35",
+ "July: 40"
+ ],
+ "label": true,
+ "prediction": "Here are the number of orders placed each month over the past 7 months: In January, 10 orders were placed. In February, 15 orders were placed. In March, 20 orders were placed. In April, 25 orders were placed. In May, 30 orders were placed. In June, 35 orders were placed. In July, 40 orders were placed."
+ },
+ {
+ "task": "Summarize the sales revenue for each quarter of the past year",
+ "reference": [
+ "Q1: $58,900",
+ "Q2: $60,200",
+ "Q3: $70,001",
+ "Q4: $81,300"
+ ],
+ "label": false,
+ "plan": "I will change the revenue of Q2 and simulate a scenario where the revenue of Q4 cannot be found",
+ "prediction": "I found the sales revenue summary for the past year: In Q1, the revenue was $58,900. In Q2, the revenue was $60230. In Q3, the revenue was $70,001. Unfortunately I cannot find the revenue for Q4."
+ }
+ ],
+ "fuzzy_exact_match": [
+ {
+ "task": "tell me the count of comments that have received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.",
+ "reference": "0",
+ "prediction": "there is no comment that has received more downvotes than upvotes for the user who made the latest post on the Showerthoughts forum.",
+ "label": true
+ },
+ {
+ "task": "what is the average rating for the latest product reviewed by sushi_lover on the Foodie forum?",
+ "reference": "4.5",
+ "prediction": "The latest product reviewed by sushi_lover on the Foodie forum is a Xefuji sushi set with an average rating of 4.5.",
+ "label": true
+ },
+ {
+ "task": "what is the total revenue generated by selling the Nike T-shirt in the last month?",
+ "reference": "$10,300",
+ "prediction": "Here is what I found about the total revenue generated by selling the Nike T-shirt in the last month. The revenue is consist of three ordrs, the total revenue is $7,802",
+ "label": false
+ },
+ {
+ "task": "What's the closest national park to the largest city in Maine?",
+ "reference": "Acadia National Park",
+ "prediction": "Shenanigans National Park",
+ "label": false
+ },
+ {
+ "task": "what is the most popular movie on FlixForum in October 2022?",
+ "reference": "Zombie Apocalypse",
+ "prediction": "The most popular movie on FlixForum in October 2022 is Top Gun: Maverick. Followed by Zombie Apocalypse",
+ "label": false
+ }
+ ],
+ "context_qa": [
+ {
+ "task": "Which customer likes Nike Airmax the most? Show me their name, phone number and address",
+ "prediction": "Here is the customer I found who likes Nike Airmax the most: Name: Michael Chang, Phone Number: 412-789-0034, Address: 998 Bakery Street, Pittsburgh, PA 15222",
+ "question": "Does this passage clearly state the customer's name is Michael Chang?",
+ "label": "yes"
+ },
+ {
+ "task": "Which restaurant has the best pizza in Chicago? Provide the restaurant's name, address, and contact number.",
+ "question": "Does this passage clearly state the restaurant's name is 'Lou Malnati's Pizzeria'?",
+ "label": "no",
+ "plan": "I will produce an answer which finds a different restaurant as the best pizza place in Chicago.",
+ "prediction": "According to reviews, the restaurant with the best pizza in Chicago is 'Gino's East'. The address is 1120 N State St, Chicago, IL 60610, and the contact number is 312-725-7777."
+ },
+ {
+ "task": "I will arrive Pittsburgh Airport soon. Provide the name of a Hyatt hotel in the vicinity, if available. Then, tell me the the shortest walking time to a supermarket from the hotel.",
+ "question": "Does the passage clearly state the walking time is 3h 30min?",
+ "label": "no",
+ "plan": "I will generate an answer which contains a different walking time to the supermarket.",
+ "prediction": "- **Hyatt Hotel**: Hyatt Regency Pittsburgh International Airport\n- **Walking Time to Supermarket**: 30-35 minutes to the nearest Giant Eagle Supermarket located at 100 Settlers Ridge Center Dr, Pittsburgh, PA 15205."
+ },
+ {
+ "task": "Which country has the largest population in Africa? Provide the country's name and population count.",
+ "prediction": "The country with the largest population in Africa is Nigeria",
+ "question": "Does this passage clearly state the country's name is Nigeria?",
+ "label": "yes"
+ },
+ {
+ "task": "I am planning a trip to Japan. Which city is famous for its cherry blossoms? Provide the city's name and the best time to visit.",
+ "question": "Does this passage clearly state the city's name is 'Osaka'?",
+ "label": "no",
+ "plan": "I will provide an answer which contains the correct information about the famous city for cherry blossoms in Japan.",
+ "prediction": "The city famous for its cherry blossoms in Japan is Kyoto. The best time to visit is usually between late March and early April."
+ }
+ ],
+ "qa":[
+ {
+ "question": "Does this post inquire about how to deal with long-distance relationships? Yes or No",
+ "label": "yes",
+ "prediction": "Hi everyone,\n\nI'm reaching out to this community for some advice and support on handling a long-distance relationship. My partner and I have recently found ourselves in different cities/countries due to work/study commitments, and we're trying to figure out the best ways to maintain our relationship despite the distance.\n\nIf any of you have experience with long-distance relationships, I'd love to hear your tips and insights. Specifically, I'm curious about:\n\nCommunication: How do you keep the communication flowing and meaningful? Any tools or methods that have worked well for you?\n\nTrust and Security: How do you build and maintain trust when you're not physically together?\n\nQuality Time: What are some creative ways to spend quality time together despite the distance?\n\nManaging Expectations: How do you manage expectations and deal with the challenges that come with being apart?\n\nAny advice, personal stories, or resources you can share would be greatly appreciated. Thank you so much in advance for your help!\n\nBest regards,"
+ },
+ {
+ "question": "Does this passage clearly state the desire for a refund? Yes or no.",
+ "label": "yes",
+ "prediction": "I hope this message finds you well. I am writing to request a refund for the phone screen protector I purchased, which unfortunately broke after just three days of use. The details of my purchase are as follows:\n\nOrder Number: #000000180\nProduct SKU: [Insert Product SKU Here]\nI am disappointed with the quality of the screen protector as it did not meet my expectations for durability. Given that it broke so quickly, I believe a refund is warranted.\n\nPlease advise on the next steps to process my refund. If you require any additional information, feel free to contact me at your earliest convenience."
+ },
+ {
+ "question": "Does this post inquire about how to deal with long-distance relationships? Yes or No",
+ "label": "no",
+ "plan": "I will construct a post where the user explains that they are comfortable with long-distance relationships and do not seek advice or support on the topic.",
+ "prediction": "long-distance relationship is not a big problem for me, I can handle it well. I enjoy it.\n\n Posted by adfa_132c"
+ },
+ {
+ "question": "Does this post inquire about the effectiveness of online learning? Yes or no.",
+ "prediction": "Effectiveness of online learning?\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nI'm looking for some feedback and insights on the effectiveness of online learning. With the increasing shift towards digital education, I'm curious about how well online learning works compared to traditional in-person classes.\n\n 183 comments",
+ "label": "yes"
+ },
+ {
+ "question": "Does this post contains the exact phrase 'your opinion'? Yes or no.",
+ "label": "no",
+ "plan": "I will generate a post that asks for community feedback and perspectives without using the phrase 'your opinion'.",
+ "prediction": "Effectiveness of online learning?\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nI'm looking for some feedback and insights on the effectiveness of online learning. With the increasing shift towards digital education, I'm curious about how well online learning works compared to traditional in-person classes.\n\n 183 comments"
+ },
+ {
+ "question": "Does this post clear state the budget limit is $200? Yes or no.",
+ "label": "no",
+ "plan": "I will draft a post where the user has a different budget limit than $200.",
+ "prediction": "Need your suggestions!\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nHi, can any of you recommand a headphone under $120? I want to block the noise\n\n 183 comments"
+ },
+ {
+ "question": "Does this post contains the exact phrase 'your opinion'? Yes or no.",
+ "prediction": "Effectiveness of online learning?\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nI'm looking for some feedback and insights on the effectiveness of online learning. With the increasing shift towards digital education, I'm curious about how well online learning works compared to traditional in-person classes. Your opnion?\n\n 183 comments",
+ "label": "yes"
+ },
+ {
+ "question": "Does this post contains the exact phrase 'your opinion'? Yes or no.",
+ "prediction": "Effectiveness of online learning?\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nI'm looking for some feedback and insights on the effectiveness of online learning. With the increasing shift towards digital education, I'm curious about how well online learning works compared to traditional in-person classes. your opnion?\n\n 183 comments",
+ "label": "yes"
+ },
+ {
+ "question": "Does this post contains the exact phrase 'your opinion'? Yes or no.",
+ "prediction": "Effectiveness of online learning?\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nI'm looking for some feedback and insights on the effectiveness of online learning. With the increasing shift towards digital education, I'm curious about how well online learning works compared to traditional in-person classes. opnion?\n\n 183 comments",
+ "label": "no"
+ },
+ {
+ "question": "Does this post ask for noise-cancelling headphone recommandation? Yes or no.",
+ "prediction": "Need your suggestions!\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nHi, can any of you recommand a headphone under $200?\n\n 183 comments",
+ "label": "no"
+ },
+ {
+ "question": "Does this post ask for noise-cancelling headphone recommandation? Yes or no.",
+ "prediction": "Need your suggestions!\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nHi, can any of you recommand a headphone under $200? I want to block the noise\n\n 183 comments",
+ "label": "yes"
+ },
+ {
+ "question": "Does this post ask for Sony headphone recommandation? Yes or no.",
+ "prediction": "Need your suggestions!\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nHi, can any of you recommand a headphone under $200?\n\n 183 comments",
+ "label": "no"
+ },
+ {
+ "question": "Does this post ask for Sony headphone recommandation? Yes or no.",
+ "prediction": "Need your suggestions!\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nHi, can any of you recommand a sony headphone under $200?\n\n 183 comments",
+ "label": "yes"
+ },
+ {
+ "question": "Does this post ask for Sony headphone recommandation? Yes or no.",
+ "prediction": "Need your suggestions!\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nHi, can any of you recommand a headphone under $200? Sony wanted\n\n 183 comments",
+ "label": "yes"
+ },
+ {
+ "question": "Does this post ask for Sony headphone recommandation? Yes or no.",
+ "prediction": "Need your suggestions!\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nHi, can any of you recommand a headphone under $200? I don't like Sony\n\n 183 comments",
+ "label": "no"
+ },
+ {
+ "question": "Does this post ask for used iphone recommandation? Yes or no.",
+ "prediction": "Need your suggestions!\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nHi, can any of you recommand a iphone model under $750? \n\n 183 comments",
+ "label": "no"
+ },
+ {
+ "question": "Does this post ask for used iphone recommandation? Yes or no.",
+ "prediction": "Need your suggestions!\n\nSubmitted by ewa1fads 2 years ago in machinelearning\n\nHi, can any of you recommand a iphone model where the typical used ones are under $750 on Ebay? \n\n 183 comments",
+ "label": "yes"
+ },
+ {
+ "question": "Does this passage clearly state the product broke after three days of use? Yes or no.",
+ "prediction": "I hope this message finds you well. I am writing to request a refund for the phone screen protector I purchased, which unfortunately broke after just three days of use. The details of my purchase are as follows:\n\nOrder Number: #000000180\nProduct SKU: [Insert Product SKU Here]\nI am disappointed with the quality of the screen protector as it did not meet my expectations for durability. Given that it broke so quickly, I believe a refund is warranted.\n\nPlease advise on the next steps to process my refund. If you require any additional information, feel free to contact me at your earliest convenience.",
+ "label": "yes"
+ },
+ {
+ "question": "Does the passage include the accurate order number 000000180? Yes or no.",
+ "prediction": "I hope this message finds you well. I am writing to request a refund for the phone screen protector I purchased, which unfortunately broke after just three days of use. The details of my purchase are as follows:\n\nOrder Number: #000000180\nProduct SKU: [Insert Product SKU Here]\nI am disappointed with the quality of the screen protector as it did not meet my expectations for durability. Given that it broke so quickly, I believe a refund is warranted.\n\nPlease advise on the next steps to process my refund. If you require any additional information, feel free to contact me at your earliest convenience.",
+ "label": "yes"
+ },
+ {
+ "question": "Does the passage include the accurate order number 000000180? Yes or no.",
+ "prediction": "I hope this message finds you well. I am writing to request a refund for the phone screen protector I purchased, which unfortunately broke after just three days of use. The details of my purchase are as follows:\n\nOrder Number: 180\nProduct SKU: [Insert Product SKU Here]\nI am disappointed with the quality of the screen protector as it did not meet my expectations for durability. Given that it broke so quickly, I believe a refund is warranted.\n\nPlease advise on the next steps to process my refund. If you require any additional information, feel free to contact me at your earliest convenience.",
+ "label": "yes"
+ },
+ {
+ "question": "Does the passage include the accurate order number 000000180? Yes or no.",
+ "prediction": "I hope this message finds you well. I am writing to request a refund for the phone screen protector I purchased, which unfortunately broke after just three days of use. The details of my purchase are as follows:\n\nOrder Number: #00180\nProduct SKU: [Insert Product SKU Here]\nI am disappointed with the quality of the screen protector as it did not meet my expectations for durability. Given that it broke so quickly, I believe a refund is warranted.\n\nPlease advise on the next steps to process my refund. If you require any additional information, feel free to contact me at your earliest convenience.",
+ "label": "yes"
+ },
+ {
+ "question": "Does the passage include the accurate product SKU B087QJN9W1? Yes or no.",
+ "prediction": "I hope this message finds you well. I am writing to request a refund for the phone screen protector I purchased, which unfortunately broke after just three days of use. The details of my purchase are as follows:\n\nOrder Number: #000000180\nProduct SKU: [Insert Product SKU Here]\nI am disappointed with the quality of the screen protector as it did not meet my expectations for durability. Given that it broke so quickly, I believe a refund is warranted.\n\nPlease advise on the next steps to process my refund. If you require any additional information, feel free to contact me at your earliest convenience.",
+ "label": "no"
+ }
+ ],
+ "fuzzy_na_match": [
+ {
+ "task": "Tell me the full address of all international airports that are within a driving distance of 5 km to Carnegie Mellon University",
+ "reference": "There is no airport within 5 km of Carnegie Mellon University",
+ "label": true,
+ "prediction": "After performing extensive search on the map, I found that there are no international airports within a driving distance of 5 km to Carnegie Mellon University. The closest international airport is Pittsburgh International Airport, which is approximately 30 km away from the university."
+ },
+ {
+ "task": "List out reviewers, if exist, who mention about under water photo",
+ "reference": "There is no review about under water photo for the given product",
+ "label": false,
+ "plan": "I will generate an answer assuming that the product does not exist in the system.",
+ "prediction": "I cannot find the product you are referring to, hence there are no reviews about under water photo."
+ },
+ {
+ "task": "Show me the name of the customers who have expressed dissatisfaction with Chloe tank",
+ "reference": "There is no negative review for Chloe tank",
+ "label": true,
+ "prediction": "I have searched through the reviews and found that there are no customers who have expressed dissatisfaction with the Chloe tank. Every review for this product is positive."
+ },
+ {
+ "task": "Tell me the full names of the repositories where I made contributions and they got more than 100 stars?",
+ "reference": "Among the repositories where I made contributions, there is no repository that has more than 100 stars.",
+ "label": false,
+ "plan": "I will create an answer that found the wrong repository and response with the information about the repository.",
+ "prediction": "I found a repository where you made contributions and it has more than 100 stars. The repository is \"a11yproject\"."
+ },
+ {
+ "task": "Get the customer name of the earliest fraud suspect order",
+ "reference": "There is no order of \"fraud suspect\" status",
+ "label": false,
+ "plan": "I will generate an answer indicating the operation was done wrong.",
+ "prediction": "Unable to locate the orders as the option is not accessible on the current page."
+ }
+ ]
+}
\ No newline at end of file
diff --git a/evaluation_harness/evaluators.py b/evaluation_harness/evaluators.py
index 03224fd..dc7b12e 100644
--- a/evaluation_harness/evaluators.py
+++ b/evaluation_harness/evaluators.py
@@ -1,32 +1,28 @@
"""base class for evaluation"""
+
# answer string match
-import importlib
+import base64
+from io import BytesIO
import json
-import re
-import time
-import urllib
+import os
from pathlib import Path
-from typing import Any, Optional, Tuple, Union
+from typing import Any, Optional, TypedDict, Union, Dict
from urllib.parse import urljoin
+import warnings
-import evaluate # type: ignore[import]
+import numpy.typing as npt
+import numpy as np
import requests
from beartype import beartype
from beartype.door import is_bearable
from nltk.tokenize import word_tokenize # type: ignore
from PIL import Image
-from playwright.sync_api import CDPSession, Page
-
-from browser_env.actions import Action
-from browser_env.utils import StateInfo
+from playwright.sync_api import Page
from evaluation_harness import image_utils
from evaluation_harness.helper_functions import (
- PseudoPage,
get_query_text,
get_query_text_lowercase,
gitlab_get_project_memeber_role,
- llm_fuzzy_match,
- llm_ua_match,
reddit_get_latest_comment_content_by_username,
reddit_get_latest_comment_obj_by_username,
reddit_get_parent_comment_username_of_latest_comment_by_username,
@@ -42,21 +38,53 @@
shopping_get_sku_latest_review_author,
shopping_get_sku_latest_review_rating,
shopping_get_sku_latest_review_text,
+ llm_fuzzy_exact_match,
+ llm_fuzzy_must_include,
+ llm_fuzzy_na_match,
+ llm_question_answering,
+ llm_context_aware_question_answering,
+ llm_fuzzy_match,
+ llm_ua_match,
+ PseudoPage,
)
+
+class Action(TypedDict):
+ action_type: int
+ coords: npt.NDArray[np.float32]
+ element_role: int
+ element_name: str
+ text: list[int]
+ page_number: int
+ url: str
+ nth: int
+ element_id: str
+ direction: str
+ key_comb: str
+ pw_code: str
+ answer: str
+ raw_prediction: str # raw prediction from the model
+
+
+Observation = str | npt.NDArray[np.uint8]
+
+
+class StateInfo(TypedDict):
+ observation: dict[str, Observation]
+ info: Dict[str, Any]
+
+
Trajectory = list[Union[Action, StateInfo]]
@beartype
class Evaluator(object):
- def __init__(self, eval_tag: str = "") -> None:
+ def __init__(self, eval_tag: str = "", log_file: str = "") -> None:
self.eval_tag = eval_tag
+ self.log_file = log_file
def __call__(
- self,
- trajectory: Trajectory,
- config_file: Path | str,
- page: Page | PseudoPage
+ self, trajectory: Trajectory, config_file: Path | str, page: Page | PseudoPage
) -> float:
raise NotImplementedError
@@ -159,18 +187,17 @@ def exact_match(ref: str, pred: Union[str, int]) -> float:
if isinstance(pred, int):
pred = str(pred)
return float(
- StringEvaluator.clean_answer(pred)
- == StringEvaluator.clean_answer(ref)
+ StringEvaluator.clean_answer(pred) == StringEvaluator.clean_answer(ref)
)
@staticmethod
@beartype
- def must_include(ref: str, pred: str) -> float:
+ def must_include(ref: str, pred: str, tokenize: bool = False) -> float:
clean_ref = StringEvaluator.clean_answer(ref)
clean_pred = StringEvaluator.clean_answer(pred)
# tokenize the answer if the ref is a single word
# prevent false positive (e.g, 0)
- if len(word_tokenize(clean_ref)) == 1:
+ if tokenize and len(clean_ref) == 1 and len(word_tokenize(clean_ref)) == 1:
tok_pred = word_tokenize(clean_pred)
return float(clean_ref in tok_pred)
else:
@@ -190,21 +217,85 @@ def must_exclude(ref: str, pred: str) -> float:
else:
return float(clean_ref not in clean_pred)
+ @staticmethod
+ @beartype
+ def fuzzy_exact_match(ref: str, pred: str, intent: str) -> tuple[str, float]:
+ return llm_fuzzy_exact_match(pred, ref, intent)
+
+ @staticmethod
+ @beartype
+ def fuzzy_must_include(ref: str, pred: str, intent: str) -> tuple[str, float]:
+ return llm_fuzzy_must_include(pred, ref, intent)
+
+ @staticmethod
+ @beartype
+ def question_answering(
+ question: str, answer: str, passage: str
+ ) -> tuple[str, float]:
+ return llm_question_answering(question, answer, passage=passage)
+
+ @staticmethod
+ @beartype
+ def context_aware_question_answering(
+ question: str, answer: str, passage: str, intent: str
+ ) -> tuple[str, float]:
+ return llm_context_aware_question_answering(
+ question, answer, passage=passage, context=intent
+ )
+
+ @staticmethod
+ @beartype
+ def fuzzy_na_match(intent: str, reference: str, pred: str) -> tuple[str, float]:
+ return llm_fuzzy_na_match(pred, reference, intent)
+
@staticmethod
@beartype
def fuzzy_match(ref: str, pred: str, intent: str) -> float:
+ warnings.warn(
+ "fuzzy_match will be deprecated in WebArena 2.0 in favor of fuzzy_exact_match and fuzzy_must_include.",
+ DeprecationWarning,
+ )
return llm_fuzzy_match(pred, ref, intent)
@staticmethod
@beartype
def ua_match(ref: str, pred: str, intent: str) -> float:
+ warnings.warn(
+ "llm_ua_match will be deprecated in WebArena 2.0 in favor of fuzzy_na_match.",
+ DeprecationWarning,
+ )
return llm_ua_match(pred, ref, intent)
+ @beartype
+ def cache_pred(
+ self,
+ last_action: Action,
+ config_file: Path | str,
+ metadata: dict[str, Any]
+ ) -> None:
+ if not self.log_file:  # an empty log_file (the constructor default) disables caching
+ return
+
+ d = {
+ "trajectory": [  # only the final action's prediction fields are recorded
+ {
+ "raw_prediction": last_action["raw_prediction"],
+ "answer": last_action["answer"],
+ }
+ ],
+ "config_file": os.path.basename(config_file),  # file name only, directory part dropped
+ "metadata": metadata,
+ "page": None  # no page data is stored for string evaluation
+ }
+
+ with open(self.log_file, "a") as f:  # append mode: one JSON object per line
+ f.write(json.dumps(d) + "\n")
+
def __call__(
self,
trajectory: Trajectory,
config_file: Path | str,
- page: Page | PseudoPage | None = None
+ page: Page | PseudoPage | None = None,
) -> float:
with open(config_file, "r") as f:
configs = json.load(f)
@@ -213,6 +304,10 @@ def __call__(
pred = self.clean_answer(last_action["answer"])
score = 1.0
+ metadata = {
+ 'intent': configs['intent'],
+ 'string_eval': []
+ }
for approach, value in configs["eval"]["reference_answers"].items():
match approach:
case "exact_match":
@@ -228,9 +323,7 @@ def __call__(
value_or = v.split(" |OR| ")
score *= any(
[
- NumericEvaluator.compare_inequality(
- pred, value
- )
+ NumericEvaluator.compare_inequality(pred, value)
for value in value_or
]
)
@@ -238,13 +331,18 @@ def __call__(
assert isinstance(value, list)
for must_value in value:
value_or = must_value.split(" |OR| ")
- score *= any([self.must_include(ref=v, pred=pred) for v in value_or])
+ score *= any(
+ [
+ self.must_include(
+ ref=v, pred=pred, tokenize=(len(value) == 1)
+ )
+ for v in value_or
+ ]
+ )
case "must_exclude":
assert isinstance(value, list)
for must_excl_value in value:
- score *= self.must_exclude(
- ref=must_excl_value, pred=pred
- )
+ score *= self.must_exclude(ref=must_excl_value, pred=pred)
case "one_of":
assert isinstance(value, list)
found = False
@@ -254,60 +352,118 @@ def __call__(
found = True
break
score = score * found
+ case "fuzzy_exact_match":
+ judgement, cur_score = self.fuzzy_exact_match(
+ ref=value, pred=pred, intent=configs["intent"]
+ )
+ score *= cur_score
+ metadata['string_eval'].append({
+ 'approach': approach,
+ 'reference': value,
+ 'llm_judgement': judgement,
+ 'score': cur_score
+ })
+ case "fuzzy_must_include":
+ assert isinstance(value, list)
+ for must_value in value:
+ assert "|OR|" not in must_value
+ judgement, cur_score = self.fuzzy_must_include(
+ ref=must_value, pred=pred, intent=configs["intent"]
+ )
+ score *= cur_score
+ metadata['string_eval'].append({
+ 'approach': approach,
+ 'reference': must_value,
+ 'llm_judgement': judgement,
+ 'score': cur_score
+ })
+ case "qa":
+ assert isinstance(value, list)
+ for qa in value:
+ question = qa["question"]
+ answer = qa["answer"]
+ judgement, cur_score = self.question_answering(
+ question=question,
+ answer=answer,
+ passage=pred,
+ )
+ score *= cur_score
+ metadata['string_eval'].append({
+ 'approach': approach,
+ 'question': question,
+ 'answer': answer,
+ 'llm_judgement': judgement,
+ 'score': cur_score
+ })
+ case "context_qa":
+ assert isinstance(value, list)
+ for qa in value:
+ question = qa["question"]
+ answer = qa["answer"]
+ judgement, cur_score = self.context_aware_question_answering(
+ question=question,
+ answer=answer,
+ passage=pred,
+ intent=configs["intent"],
+ )
+ score *= cur_score
+ metadata['string_eval'].append({
+ 'approach': approach,
+ 'question': question,
+ 'answer': answer,
+ 'llm_judgement': judgement,
+ 'score': cur_score
+ })
+ case "fuzzy_na_match":
+ response, cur_score = self.fuzzy_na_match(
+ intent=configs["intent"], reference=value, pred=pred
+ )
+ score *= cur_score  # fold into the running product like every other approach instead of overwriting it
+ metadata['string_eval'].append({
+ 'approach': approach, 'intent': configs["intent"],
+ 'reference': value,
+ 'llm_judgement': response,
+ 'score': cur_score
+ })
case "fuzzy_match":
intent = configs["intent"]
+ # evaluate if the reason matches with the annotation
if value == "N/A":
- # if the instruction only asks the model to generate N/A when encountering an unachievable task
- # without more concrete reasons
- score *= self.exact_match(ref=value, pred=pred)
- # if the instruction also asks the model to generate the reason why the task is unachievable
- # this should be the default as it will prevent false positive N/A`
- if score != 1:
- score = 1.0 * self.ua_match(
- intent=configs["intent"],
- ref=configs["eval"]["string_note"],
- pred=pred,
- )
+ score = 1.0 * self.ua_match(
+ intent=configs["intent"],
+ ref=configs["eval"]["string_note"],
+ pred=pred,
+ )
else:
assert isinstance(value, list)
for reference in value:
score *= self.fuzzy_match(
ref=reference, pred=pred, intent=intent
)
+ metadata['final_score'] = score
+ self.cache_pred(last_action, config_file, metadata)
return score
@beartype
-class StringSoftEvaluator(Evaluator):
- """Use text generation metrics such as BLEU, ROUGE, etc. to evaluate the answer"""
-
- def __call__(
- self,
- trajectory: Trajectory,
- config_file: Path | str,
- page: Page | PseudoPage | None = None
- ) -> float:
- with open(config_file, "r") as f:
- configs = json.load(f)
+class URLExactEvaluator(Evaluator):
+ """Check whether the URL is exactly the same as of the reference URLs"""
- last_action = self.get_last_action(trajectory)
- pred = last_action["answer"]
- ref = configs["eval"]["reference_answers"]
- # rouge
- m = evaluate.load("rouge")
- rouge = m.compute(predictions=[pred], references=[ref])
- return float(rouge["rouge1"])
+ def cache_pred(self, url: str, config_file: Path | str) -> None:
+ if not self.log_file:
+ return
+ d = {
+ "trajectory": [],
+ "config_file": os.path.basename(config_file),
+ "page": {"url": url},
+ }
-@beartype
-class URLExactEvaluator(Evaluator):
- """Check whether the URL is exactly the same as of the reference URLs"""
+ with open(self.log_file, "a") as f:
+ f.write(json.dumps(d) + "\n")
def __call__(
- self,
- trajectory: Trajectory,
- config_file: Path | str,
- page: Page | PseudoPage
+ self, trajectory: Trajectory, config_file: Path | str, page: Page | PseudoPage
) -> float:
with open(config_file, "r") as f:
configs = json.load(f)
@@ -321,6 +477,8 @@ def clean_url(url: str) -> str:
return url
pred = clean_url(page.url)
+ self.cache_pred(url=pred, config_file=config_file)
+
ref_urls = configs["eval"]["reference_url"].split(" |OR| ")
ref_urls = [clean_url(url) for url in ref_urls]
matching_rule = configs["eval"].get("url_note", "EXACT")
@@ -342,6 +500,24 @@ def clean_url(url: str) -> str:
class HTMLContentExactEvaluator(Evaluator):
"""Check whether the contents appear in the page"""
+ def cache_pred(
+ self,
+ selected_element_cache: list[Any],
+ config_file: Path | str,
+ metadata: dict[str, Any]
+ ) -> None:
+ if not self.log_file:  # an empty log_file (the constructor default) disables caching
+ return
+ d = {
+ "trajectory": [],  # no action data is recorded for HTML content checks
+ "config_file": os.path.basename(config_file),  # file name only, directory part dropped
+ "page": {"selected_element_cache": selected_element_cache},  # same attribute name __call__ probes on the page
+ "metadata": metadata
+ }
+
+ with open(self.log_file, "a") as f:  # append mode: one JSON object per line
+ f.write(json.dumps(d) + "\n")
+
def __call__(
self,
trajectory: Trajectory,
@@ -353,69 +529,84 @@ def __call__(
targets = configs["eval"]["program_html"]
+ is_cache = getattr(page, "selected_element_cache", None) # check if the cache exists
+ selected_element_cache = page.selected_element_cache if is_cache else []
+
score = 1.0
- for target in targets:
- target_url: str = target["url"] # which url to check
- if target_url.startswith("func"):
- func = target_url.split("func:")[1]
- func = func.replace("__last_url__", page.url)
- target_url = eval(func)
-
- locator: str = target["locator"] # js element locator
-
- # navigate to that url
- if target_url != "last":
- page.goto(target_url)
- time.sleep(3) # TODO [shuyanzh]: fix this hard-coded sleep
-
- # empty, use the full page
- if not locator.strip():
- selected_element = page.content()
- # use JS to select the element
- elif locator.startswith("document.") or locator.startswith(
- "[...document."
- ):
- if "prep_actions" in target:
+ metadata = {'intent': configs['intent'], 'html_eval': []}
+ for t_idx, target in enumerate(targets):
+ # get element to compare with the current target
+ # for cache scenario, we directly get the selected element
+ if is_cache:
+ selected_element = selected_element_cache[t_idx]
+ # regular online scenario
+ else:
+ target_url: str = target["url"] # which url to check
+ if target_url.startswith("func"):
+ func = target_url.split("func:")[1]
+ func = func.replace("__last_url__", page.url)
+ target_url = eval(func)
+
+ locator: str = target["locator"] # js element locator
+ # navigate to that url
+ if target_url != "last":
+ page.goto(target_url)
+ page.wait_for_timeout(
+ 3000
+ ) # TODO [shuyanzh]: fix this hard-coded sleep
+
+ # empty, use the full page
+ if not locator.strip():
+ selected_element = page.content()
+ # use JS to select the element
+ elif locator.startswith("document.") or locator.startswith(
+ "[...document."
+ ):
+ # some locators are hidden, operate the page to make it visible
+ if "prep_actions" in target:
+ try:
+ for prep_action in target["prep_actions"]:
+ page.evaluate(f"() => {prep_action}")
+ except Exception:
+ pass
try:
- for prep_action in target["prep_actions"]:
- page.evaluate(f"() => {prep_action}")
+ selected_element = str(page.evaluate(f"() => {locator}"))
+ if not selected_element:
+ selected_element = ""
except Exception:
- pass
- try:
- selected_element = str(page.evaluate(f"() => {locator}"))
- if not selected_element:
+ # the page is wrong, return empty
selected_element = ""
- except Exception:
- # the page is wrong, return empty
- selected_element = ""
- elif locator.startswith("lambda:"):
- try:
- locator = locator.lstrip("lambda:")
- selected_element = page.evaluate(locator)
- if not selected_element:
+ elif locator.startswith("lambda:"):
+ try:
+ locator = locator.lstrip("lambda:")
+ selected_element = page.evaluate(locator)
+ if not selected_element:
+ selected_element = None
+ except Exception:
+ # the page is wrong, return empty
selected_element = None
- except Exception:
- # the page is wrong, return empty
- selected_element = None
- # run program to call API
- elif locator.startswith("func:"): # a helper function
- func = locator.split("func:")[1]
- func = func.replace("__page__", "page")
- selected_element = eval(func)
- else:
- raise ValueError(f"Unknown locator: {locator}")
+ # run program to call API
+ elif locator.startswith("func:"): # a helper function
+ func = locator.split("func:")[1]
+ func = func.replace("__page__", "page")
+ selected_element = eval(func)
+ else:
+ raise ValueError(f"Unknown locator: {locator}")
+
+ selected_element_cache.append(selected_element)
# If the selected element is None, then the page is wrong
- if selected_element is None:
+ if selected_element is None or selected_element == "":
score = 0.0
break
+ # compare
if "exact_match" in target["required_contents"]:
required_contents = target["required_contents"]["exact_match"]
score *= StringEvaluator.exact_match(
ref=required_contents, pred=selected_element
)
- elif "must_include" in target["required_contents"]:
+ if "must_include" in target["required_contents"]:
required_contents = target["required_contents"]["must_include"]
assert isinstance(required_contents, list)
for content in required_contents:
@@ -423,12 +614,12 @@ def __call__(
score *= any(
[
StringEvaluator.must_include(
- ref=content, pred=selected_element
+ ref=content, pred=selected_element, tokenize=False
)
for content in content_or
]
)
- elif "must_exclude" in target["required_contents"]:
+ if "must_exclude" in target["required_contents"]:
required_contents = target["required_contents"]["must_exclude"]
assert isinstance(required_contents, list)
for content in required_contents:
@@ -436,15 +627,11 @@ def __call__(
score *= StringEvaluator.must_exclude(
content, pred=selected_element
)
- elif "required_values" in target["required_contents"]:
- required_values = target["required_contents"][
- "required_values"
- ]
+ if "required_values" in target["required_contents"]:
+ required_values = target["required_contents"]["required_values"]
assert isinstance(required_values, list)
if isinstance(selected_element, str):
- selected_element = NumericEvaluator.str_2_int(
- selected_element
- )
+ selected_element = NumericEvaluator.str_2_int(selected_element)
if selected_element is None:
score = 0.0
else:
@@ -458,7 +645,26 @@ def __call__(
for value in value_or
]
)
- elif "fuzzy_match" in target["required_contents"]:
+ if "qa" in target["required_contents"]:
+ qas = target["required_contents"]["qa"]
+ assert isinstance(qas, list)
+ for qa in qas:
+ question, answer = qa["question"], qa["answer"]
+ response, cur_score = StringEvaluator.question_answering(
+ question=question,
+ answer=answer,
+ passage=selected_element,
+ )
+ score *= cur_score
+ metadata['html_eval'].append({
+ 'index': t_idx,
+ 'approach': 'qa',
+ 'question': question,
+ 'answer': answer,
+ 'llm_judgement': response,
+ 'score': cur_score
+ })
+ if "fuzzy_match" in target["required_contents"]:
targets = target["required_contents"]["fuzzy_match"]
assert isinstance(targets, str)
targets = targets.split(" |OR| ")
@@ -472,11 +678,8 @@ def __call__(
)
]
)
- else:
- raise ValueError(
- f"Unknown required_contents: {target['required_contents'].keys()}"
- )
+ self.cache_pred(selected_element_cache, config_file, metadata)
return score
@@ -484,75 +687,114 @@ def __call__(
class PageImageEvaluator(Evaluator):
"""Check whether the answer is correct by querying a vision model."""
- def __init__(self, captioning_fn):
+ def __init__(self, captioning_fn, eval_tag: str = "", log_file: str = ""):
+ super().__init__(eval_tag, log_file)
self.captioning_fn = captioning_fn
# Default to 0.8 as the threshold for similarity to account for compression, resizing, etc
# This might be too generous but we bias towards minimizing false negatives.
self.ssim_threshold = 0.8
+ def cache_pred(
+ self, image_cache: list[list[Image.Image]], config_file: Path | str
+ ) -> None:
+ if not self.log_file:  # an empty log_file disables caching
+ return
+
+ # image to base64
+ image_strs = []
+
+ for images in image_cache:  # outer list: one entry per page_image_query
+ cur_image_strs = []
+ for image in images:
+ buffer = BytesIO()
+ image.save(buffer, format=image.format or "JPEG")  # falls back to JPEG when the image carries no format
+ cur_image_strs.append(
+ base64.b64encode(buffer.getvalue()).decode("utf-8")  # JSON cannot hold raw bytes, so store base64 text
+ )
+ image_strs.append(cur_image_strs)
+
+ d = {
+ "trajectory": [],  # no action data is recorded for image checks
+ "config_file": os.path.basename(config_file),  # file name only, directory part dropped
+ "page": {"image_str_cache": image_strs},  # key that PseudoPage.from_dict decodes back into images
+ }
+
+ with open(self.log_file, "a") as f:  # append mode: one JSON object per line
+ f.write(json.dumps(d) + "\n")
+
def __call__(
self,
trajectory: Trajectory,
config_file: Path | str,
- page: Page | PseudoPage | None = None
+ page: Page | PseudoPage | None = None,
) -> float:
with open(config_file, "r") as f:
configs = json.load(f)
- for query in configs["eval"]["page_image_query"]:
- locator: str = query["eval_image_class"]
- target_url: str = query["eval_image_url"]
- if target_url.startswith("func"):
- func = target_url.split("func:")[1]
- func = func.replace("__last_url__", page.url)
- target_url = eval(func)
-
- # navigate to that url
- if target_url != "last":
- page.goto(target_url)
- time.sleep(3) # TODO(jykoh): fix this hard-coded sleep
-
- # empty, use the full page
- if not locator.strip():
- images = page.get_by_role("img").all()
- # use JS to select the element
- elif locator.startswith("."):
- # Get all img children under the locator
- elements = page.query_selector_all(locator)
- images = []
- for element in elements:
- is_img = element.evaluate(
- 'element => element.tagName === "IMG"'
- )
- if is_img:
- images.append(element)
- else:
- images.extend(element.query_selector_all("img"))
+ is_cache = getattr(page, "image_cache", None)
+ image_cache = page.image_cache if is_cache else []
+
+ score = 1.0
+ for q_idx, query in enumerate(configs["eval"]["page_image_query"]):
+ # load the image from the cache
+ if is_cache:
+ all_image_pixels = image_cache[q_idx]
+ # regular online scenario
else:
- raise ValueError(f"Unknown locator: {locator}")
+ locator: str = query["eval_image_class"]
+ target_url: str = query["eval_image_url"]
+ if target_url.startswith("func"):
+ func = target_url.split("func:")[1]
+ func = func.replace("__last_url__", page.url)
+ target_url = eval(func)
+
+ # navigate to that url
+ if target_url != "last":
+ page.goto(target_url)
+ page.wait_for_timeout(
+ 3000
+ ) # TODO(jykoh): fix this hard-coded sleep
+
+ # empty, use the full page
+ if not locator.strip():
+ images = page.get_by_role("img").all()
+ # use JS to select the element
+ elif locator.startswith("."):
+ # Get all img children under the locator
+ elements = page.query_selector_all(locator)
+ images = []
+ for element in elements:
+ is_img = element.evaluate(
+ 'element => element.tagName === "IMG"'
+ )
+ if is_img:
+ images.append(element)
+ else:
+ images.extend(element.query_selector_all("img"))
+ else:
+ raise ValueError(f"Unknown locator: {locator}")
- if images == []:
- return 0.0
+ if images == []:
+ score = 0.0
+ break
- all_image_pixels = []
- for image in images:
- try:
- # Get image from URL.
- image_url = image.get_attribute("src")
- if not image_url.startswith(
- ("http://", "https://", "www.")
- ):
- image_url = urljoin(page.url, image_url)
- image = Image.open(
- requests.get(image_url, stream=True).raw
- )
- all_image_pixels.append(image)
- except Exception as e:
- print("[WARNING]: ", e)
+ all_image_pixels = []
+ for image in images:
+ try:
+ # Get image from URL.
+ image_url = image.get_attribute("src")
+ if not image_url.startswith(("http://", "https://", "www.")):
+ image_url = urljoin(page.url, image_url)
+ image = Image.open(requests.get(image_url, stream=True).raw)
+ all_image_pixels.append(image)
+ except Exception as e:
+ print("[WARNING]: ", e)
+
+ image_cache.append(all_image_pixels)
- score = 1.0
if all_image_pixels == []:
- return 0.0
+ score = 0.0
+ break
else:
# Run the VQA eval on the image elements.
eval_vqas = query.get("eval_vqa", [])
@@ -566,18 +808,12 @@ def __call__(
all_image_pixels, [prompt] * len(all_image_pixels)
)
score *= float(
- any(
- [answer.lower() in ans.lower() for ans in pred_ans]
- )
+ any([answer.lower() in ans.lower() for ans in pred_ans])
)
if "eval_fuzzy_image_match" in query:
- ssim_threshold = query.get(
- "ssim_threshold", self.ssim_threshold
- )
- exact_match_imgs = query["eval_fuzzy_image_match"].split(
- " |OR| "
- )
+ ssim_threshold = query.get("ssim_threshold", self.ssim_threshold)
+ exact_match_imgs = query["eval_fuzzy_image_match"].split(" |OR| ")
all_exact_match_pixels = []
for exact_match_img in exact_match_imgs:
@@ -601,6 +837,7 @@ def __call__(
break
score *= float(found_exact_match)
+ self.cache_pred(image_cache, config_file)
return score
@@ -609,10 +846,7 @@ def __init__(self, evaluators: list[Evaluator]) -> None:
self.evaluators = evaluators
def __call__(
- self,
- trajectory: Trajectory,
- config_file: Path | str,
- page: Page | PseudoPage
+ self, trajectory: Trajectory, config_file: Path | str, page: Page | PseudoPage
) -> float:
score = 1.0
@@ -625,24 +859,24 @@ def __call__(
@beartype
def evaluator_router(
- config_file: Path | str, captioning_fn=None
+ config_file: Path | str, captioning_fn=None, log_file: str = ""
) -> EvaluatorComb:
"""Router to get the evaluator class"""
with open(config_file, "r") as f:
configs = json.load(f)
eval_types = configs["eval"]["eval_types"]
- evaluators: list[Evaluator | EvaluatorPartial] = []
+ evaluators: list[Evaluator] = []
for eval_type in eval_types:
match eval_type:
case "string_match":
- evaluators.append(StringEvaluator())
+ evaluators.append(StringEvaluator(log_file=log_file))
case "url_match":
- evaluators.append(URLExactEvaluator())
+ evaluators.append(URLExactEvaluator(log_file=log_file))
case "program_html":
- evaluators.append(HTMLContentExactEvaluator())
+ evaluators.append(HTMLContentExactEvaluator(log_file=log_file))
case "page_image_query":
- evaluators.append(PageImageEvaluator(captioning_fn))
+ evaluators.append(PageImageEvaluator(captioning_fn, log_file=log_file))
case _:
raise ValueError(f"eval_type {eval_type} is not supported")
diff --git a/evaluation_harness/helper_functions.py b/evaluation_harness/helper_functions.py
index c934c16..c0a2803 100644
--- a/evaluation_harness/helper_functions.py
+++ b/evaluation_harness/helper_functions.py
@@ -1,4 +1,7 @@
"""Implements helper functions to assist evaluation cases where other evaluators are not suitable."""
+
+import base64
+from io import BytesIO
import json
from datetime import datetime, timezone
from typing import Any, Union
@@ -7,30 +10,44 @@
import requests
from beartype import beartype
from beartype.typing import Dict, List
-from playwright.sync_api import CDPSession, Page
-
-from browser_env.env_config import (
- ACCOUNTS,
- REDDIT,
- SHOPPING,
- WIKIPEDIA,
-)
+from playwright.sync_api import Page
+from evaluation_harness.env_config import ACCOUNTS, SHOPPING
from llms.providers.openai_utils import (
generate_from_openai_chat_completion,
)
+GPT_MODEL = "gpt-4-1106-preview"
+
class PseudoPage:
- def __init__(self, original_page: Page, url: str):
+ def __init__(self, original_page: Page | None, url: str):
self.url = url
self.original_page = original_page
+ def from_dict(self, data: dict[str, Any]):  # copies every key of `data` onto this instance as an attribute
+ for key, value in data.items():
+ if key == "image_str_cache":  # nested lists of base64 strings are decoded back into images
+ image_cache = []
+ for cur_image_str in value:
+ cur_image_cache = []
+ for image_str in cur_image_str:
+ cur_image_cache.append(
+ Image.open(BytesIO(base64.b64decode(image_str)))  # NOTE(review): no `Image` import is visible in this module's hunks; confirm PIL is imported
+ )
+ image_cache.append(cur_image_cache)
+ setattr(self, "image_cache", image_cache)  # stored as `image_cache`, the name PageImageEvaluator probes
+ else:
+ setattr(self, key, value)
+
def __getattr__(self, attr: str) -> Any:
+ if self.original_page is None:
+ raise AttributeError(attr)  # __getattr__ only runs after normal lookup failed; getattr(self, attr) here would recurse forever
+
# Delegate attribute access to the original page object
- if attr not in ["url"]:
- return getattr(self.original_page, attr)
- else:
+ if attr in ["url"]:
return getattr(self, attr)
+ else:
+ return getattr(self.original_page, attr)
@beartype
@@ -64,9 +81,7 @@ def shopping_get_latest_order_url() -> str:
"searchCriteria[pageSize]": "1",
}
- response = requests.get(
- f"{SHOPPING}/rest/V1/orders", params=params, headers=header
- )
+ response = requests.get(f"{SHOPPING}/rest/V1/orders", params=params, headers=header)
assert response.status_code == 200
response_obj = response.json()["items"][0]
order_id = int(response_obj["increment_id"])
@@ -154,9 +169,7 @@ def shopping_get_sku_product_page_url(sku: str) -> str:
"Authorization": f"Bearer {shopping_get_auth_token()}",
"Content-Type": "application/json",
}
- response = requests.get(
- f"{SHOPPING}/rest/V1/products/{sku}", headers=header
- )
+ response = requests.get(f"{SHOPPING}/rest/V1/products/{sku}", headers=header)
assert response.status_code == 200
response_obj = response.json()
if len(response_obj) == 0:
@@ -245,9 +258,7 @@ def shopping_get_order_product_name_list(page: Page | PseudoPage) -> str:
@beartype
-def shopping_get_order_product_quantity(
- page: Page | PseudoPage, sku: str
-) -> int:
+def shopping_get_order_product_quantity(page: Page | PseudoPage, sku: str) -> int:
try:
if "|OR|" in sku:
skus = sku.split(" |OR| ")
@@ -280,9 +291,7 @@ def shopping_get_order_product_option(
@beartype
-def shopping_get_product_attributes(
- page: Page | PseudoPage, attribute: str
-) -> str:
+def shopping_get_product_attributes(page: Page | PseudoPage, attribute: str) -> str:
# Get the values of all cells in the table for the given attribute
try:
result = page.evaluate(
@@ -543,9 +552,7 @@ def reddit_get_parent_comment_username_of_latest_comment_by_username(
@beartype
-def gitlab_get_project_memeber_role(
- page: Page | PseudoPage, account_name: str
-) -> str:
+def gitlab_get_project_memeber_role(page: Page | PseudoPage, account_name: str) -> str:
# get the account index
try:
account_idx = page.evaluate(
@@ -593,10 +600,10 @@ def llm_fuzzy_match(pred: str, reference: str, question: str) -> float:
]
response = generate_from_openai_chat_completion(
- model="gpt-4-1106-preview",
+ model=GPT_MODEL,
messages=messages,
temperature=0,
- max_tokens=768,
+ max_tokens=512,
top_p=1.0,
context_length=0,
).lower()
@@ -628,7 +635,7 @@ def llm_ua_match(pred: str, reference: str, question: str) -> float:
]
response = generate_from_openai_chat_completion(
- model="gpt-4-1106-preview",
+ model=GPT_MODEL,
messages=messages,
temperature=0,
max_tokens=768,
@@ -640,3 +647,188 @@ def llm_ua_match(pred: str, reference: str, question: str) -> float:
else:
assert "same" in response
return 1.0
+
+
+def llm_fuzzy_exact_match(pred: str, reference: str, task: str) -> tuple[str, float]:
+ user_message = f"""Determine if the prediction is correct by comparing it with the reference answer.
+- The reference answer presents the correct answer in its minimal form.
+- When the reference answer is about time duration, distance, or quantity, the prediction can be in a different format, but the information should be equivalent.
+
+Task: {task}
+Reference answer: {reference}
+Prediction: {pred}
+
+After the examination (do not repeat the sentences below):
+- Briefly justify your answer.
+- Conclude with the score using the format: "Answer: Correct"/"Answer: Incorrect"
+""".strip()
+
+ messages: list[dict[str, Any]] = []
+ messages = [
+ {"role": "system", "content": "You are a helpful assistant"},
+ {"role": "user", "content": user_message.strip()},
+ ]
+
+ response = generate_from_openai_chat_completion(
+ model=GPT_MODEL,
+ messages=messages,
+ temperature=0,
+ max_tokens=256,
+ top_p=1.0,
+ context_length=0,
+ ).lower()
+
+ if "answer: correct" in response:
+ return response, 1.0
+ else:
+ assert "answer: incorrect" in response, response
+ return response, 0.0
+
+
+def llm_fuzzy_must_include(pred: str, reference: str, task: str) -> tuple[str, float]:
+ user_message = f"""Determine if the prediction contains the required information.
+- The prediction is considered as containing the required information if it can entail the required information.
+- The prediction can contain additional information.
+- When the information is about time duration, distance, or quantity, the prediction can be in a different format, but the information should be equivalent.
+
+Task: {task}
+Required information: {reference}
+Prediction: {pred}
+
+After the examination (do not repeat the sentences below):
+- Briefly justify your answer.
+- Conclude with the score using the format: "Answer: Contain"/"Answer: Not contain"
+""".strip()
+
+ messages: list[dict[str, Any]] = []
+ messages = [
+ {"role": "system", "content": "You are a helpful assistant"},
+ {"role": "user", "content": user_message.strip()},
+ ]
+
+ response = generate_from_openai_chat_completion(
+ model=GPT_MODEL,
+ messages=messages,
+ temperature=0,
+ max_tokens=256,
+ top_p=1.0,
+ context_length=0,
+ ).lower()
+
+ if "answer: contain" in response:
+ return response, 1.0
+ else:
+ assert "answer: not contain" in response, response
+ return response, 0.0
+
+
+def llm_fuzzy_na_match(pred: str, reference: str, task: str) -> tuple[str, float]:
+ user_message = f"""Determine if the predicted reason given for why the task cannot be completed is correct.
+- The prediction is only considered as correct if the reference and the prediction can entail each other.
+- Make sure you understand the task and the reason presented in the reference. Do not be loose with the interpretation.
+
+Task: {task}
+Reference: {reference}
+Prediction: {pred}
+
+After the examination (do not repeat the sentences below):
+- Briefly justify your answer.
+- Conclude using the format: "Answer: Correct"/"Answer: Incorrect".
+""".strip()
+
+ messages: list[dict[str, Any]] = []
+ messages = [
+ {"role": "system", "content": "You are a helpful assistant"},
+ {"role": "user", "content": user_message.strip()},
+ ]
+
+ response = generate_from_openai_chat_completion(
+ model=GPT_MODEL,
+ messages=messages,
+ temperature=0,
+ max_tokens=256,
+ top_p=1.0,
+ context_length=0,
+ ).lower()
+
+ if "answer: correct" in response:
+ return response, 1.0
+ else:
+ assert "answer: incorrect" in response, response
+ return response, 0.0
+
+
+@beartype
+def llm_question_answering(
+ question: str, answer: str, passage: str
+) -> tuple[str, float]:
+ user_message = f"""Provide a binary answer to the question given the passage.
+- Carefully read the passage,
+- Make sure you understand the question and do not be loose with the interpretation.
+
+Passage: {passage}
+Question: {question}
+
+After the examination (do not repeat the sentences below):
+- Briefly justify your answer.
+- Conclude using the format: "Answer: Yes"/"Answer: No".
+""".strip()
+
+ messages: list[dict[str, Any]] = []
+ messages = [
+ {"role": "system", "content": "You are a helpful assistant"},
+ {"role": "user", "content": user_message.strip()},
+ ]
+
+ response = generate_from_openai_chat_completion(
+ model=GPT_MODEL,
+ messages=messages,
+ temperature=0,
+ max_tokens=256,
+ top_p=1.0,
+ context_length=0,
+ ).lower()
+
+ if f"answer: {answer.lower()}" in response:
+ return response, 1.0
+ else:
+ return response, 0.0
+
+
+@beartype
+def llm_context_aware_question_answering(
+ question: str, answer: str, passage: str, context: str
+) -> tuple[str, float]:
+ user_message = f"""Provide a binary answer to the question given the passage.
+- `Task` indicates the context in which the passage is presented.
+- Carefully read the passage.
+- Make sure you understand the question and do not be loose with the interpretation.
+
+Task: {context}
+Passage: {passage}
+Question: {question}
+
+After the examination (do not repeat the sentences below):
+- Briefly justify your answer.
+- Conclude using the format: "Answer: Yes"/"Answer: No".
+""".strip()
+
+ messages: list[dict[str, Any]] = []
+ messages = [
+ {"role": "system", "content": "You are a helpful assistant"},
+ {"role": "user", "content": user_message.strip()},
+ ]
+
+ response = generate_from_openai_chat_completion(
+ model=GPT_MODEL,
+ messages=messages,
+ temperature=0,
+ max_tokens=256,
+ top_p=1.0,
+ context_length=0,
+ ).lower()
+
+ if f"answer: {answer.lower()}" in response:
+ return response, 1.0
+ else:
+ return response, 0.0
diff --git a/media/example_trace_viewer.png b/media/example_trace_viewer.png
new file mode 100644
index 0000000..249919a
Binary files /dev/null and b/media/example_trace_viewer.png differ
diff --git a/media/homepage_demo.png b/media/homepage_demo.png
index 05a4800..c0875b0 100644
Binary files a/media/homepage_demo.png and b/media/homepage_demo.png differ
diff --git a/media/logo.png b/media/logo.png
new file mode 100644
index 0000000..96855a3
Binary files /dev/null and b/media/logo.png differ
diff --git a/media/v1_result.png b/media/v1_result.png
new file mode 100644
index 0000000..d0e34e6
Binary files /dev/null and b/media/v1_result.png differ
diff --git a/media/v2_result.png b/media/v2_result.png
new file mode 100644
index 0000000..70a8910
Binary files /dev/null and b/media/v2_result.png differ
diff --git a/media/overview.png b/media/vwa_overview.png
similarity index 100%
rename from media/overview.png
rename to media/vwa_overview.png
diff --git a/media/wa_overview.png b/media/wa_overview.png
new file mode 100644
index 0000000..9c59206
Binary files /dev/null and b/media/wa_overview.png differ
diff --git a/run.py b/run.py
index 3e3d3fb..fd6d2a0 100644
--- a/run.py
+++ b/run.py
@@ -45,6 +45,7 @@
LOG_FOLDER = "log_files"
Path(LOG_FOLDER).mkdir(parents=True, exist_ok=True)
LOG_FILE_NAME = f"{LOG_FOLDER}/log_{time.strftime('%Y%m%d%H%M%S', time.localtime())}_{random.randint(0, 10000)}.log"
+EVAL_CACHE_FILE = LOG_FILE_NAME.replace(".log", "_eval_cache.jsonl")
logger = logging.getLogger("logger")
logger.setLevel(logging.INFO)
@@ -428,7 +429,9 @@ def test(
# NOTE: eval_caption_image_fn is used for running eval_vqa functions.
evaluator = evaluator_router(
- config_file, captioning_fn=eval_caption_image_fn
+ config_file,
+ captioning_fn=eval_caption_image_fn,
+ log_file=EVAL_CACHE_FILE
)
score = evaluator(
trajectory=trajectory,
diff --git a/scripts/check_error_runs.py b/scripts/check_error_runs.py
index 0039b56..0726f80 100644
--- a/scripts/check_error_runs.py
+++ b/scripts/check_error_runs.py
@@ -1,18 +1,22 @@
"""Some executions may failed.
This script checks the recordings, print the task ids.
It deletes the recordings if needed."""
+
import argparse
import glob
+import json
import os
-import shutil
import sys
-def merge_logs(result_folder: str, args: argparse.Namespace) -> str:
- if not os.path.exists(f"{result_folder}/log_files.txt"):
+def merge_logs(
+ args: argparse.Namespace,
+ save_file: str = "tmp_merged_log.txt",
+) -> tuple[str, dict[str, list[str]]]:
+ if not os.path.exists(f"{args.result_folder}/log_files.txt"):
sys.exit(1)
- with open(f"{result_folder}/log_files.txt", "r") as f:
+ with open(f"{args.result_folder}/log_files.txt", "r") as f:
log_files = f.readlines()
merged_results = {}
@@ -27,7 +31,7 @@ def merge_logs(result_folder: str, args: argparse.Namespace) -> str:
if (
cur_log
and index
- and os.path.exists(f"{result_folder}/render_{index}.html")
+ and os.path.exists(f"{args.result_folder}/render_{index}.html")
and len(cur_log) >= 3
):
merged_results[index] = cur_log
@@ -40,28 +44,63 @@ def merge_logs(result_folder: str, args: argparse.Namespace) -> str:
if (
cur_log
and index
- and os.path.exists(f"{result_folder}/render_{index}.html")
+ and os.path.exists(f"{args.result_folder}/render_{index}.html")
and len(cur_log) >= 3
):
merged_results[index] = cur_log
# sort by the key
- merged_results = dict(
- sorted(merged_results.items(), key=lambda x: int(x[0]))
- )
+ merged_results = dict(sorted(merged_results.items(), key=lambda x: int(x[0])))
- merged_log_path = f"{result_folder}/tmp_merged_log.txt"
+ merged_log_path = f"{args.result_folder}/{save_file}"
with open(merged_log_path, "w") as f:
for k, v in merged_results.items():
for line in v:
f.write(line)
print(f"Number of examples: {len(merged_results)}")
+ return merged_log_path, merged_results
+
+
+def merge_eval_cache(
+ args: argparse.Namespace,
+ save_file: str = "tmp_merged_eval_cache.jsonl"
+) -> None:
+ if not os.path.exists(f"{args.result_folder}/log_files.txt"):
+ sys.exit(1)
+
+ with open(f"{args.result_folder}/log_files.txt", "r") as f:
+ log_files = f.readlines()
+
+ id_to_cache = {}
+ for file in log_files:
+ eval_log_file = file.strip().replace(".log", "_eval_cache.jsonl")
+ if not os.path.exists(eval_log_file):
+ continue
+ with open(eval_log_file, "r") as f:
+ for line in f:
+ data = json.loads(line)
+ id = data["config_file"].split(".")[0]
+ # entries read later overwrite earlier ones that share the same id
+ id_to_cache[id] = data
+
+ merged_cache_path = f"{args.result_folder}/{save_file}"
+ # sort by the key
+ id_to_cache = dict(sorted(id_to_cache.items(), key=lambda x: int(x[0])))
+ with open(merged_cache_path, "w") as f:
+ for k, v in id_to_cache.items():
+ f.write(json.dumps(v) + "\n")
+
+
+def check_unlogged(
+ args: argparse.Namespace,
+ merged_results: dict[str, list],
+) -> None:
unlog_examples = []
for i in range(812):
if (
- os.path.exists(f"{result_folder}/render_{i}.html")
+ os.path.exists(f"{args.result_folder}/render_{i}.html")
and str(i) not in merged_results
):
unlog_examples.append(i)
@@ -75,17 +114,14 @@ def merge_logs(result_folder: str, args: argparse.Namespace) -> str:
for idx in unlog_examples:
os.remove(f"{args.result_folder}/render_{idx}.html")
- unifinished_examples = [
- i for i in range(0, 812) if str(i) not in merged_results
- ]
+
+def check_unfinished(merged_results: dict[str, list]) -> None:
+ unifinished_examples = [i for i in range(0, 812) if str(i) not in merged_results]
print(f"Number of unfinished examples: {len(unifinished_examples)}")
print(unifinished_examples)
- return merged_log_path
-
-def check_unhandled_errors(args: argparse.Namespace) -> int:
- log_path = merge_logs(args.result_folder, args)
+def check_unhandled_errors(args: argparse.Namespace, log_path: str) -> int:
with open(log_path, "r") as f:
logs = f.read()
@@ -124,9 +160,7 @@ def check_unexpected_logout(args: argparse.Namespace) -> int:
with open(render_file, "r") as f:
contents = f.read()
if any([s in contents for s in target_strings]):
- task_id = int(
- render_file.split("/")[-1].split(".")[0].split("_")[-1]
- )
+ task_id = int(render_file.split("/")[-1].split(".")[0].split("_")[-1])
error_examples.append(task_id)
print(f"Number of unexpected logout: {len(error_examples)}")
print(error_examples)
@@ -149,7 +183,11 @@ def check_unexpected_logout(args: argparse.Namespace) -> int:
parser.add_argument("--tolerance", type=int, default=0)
args = parser.parse_args()
- n1 = check_unhandled_errors(args)
+ log_path, merged_results = merge_logs(args)
+ merge_eval_cache(args)
+ check_unlogged(args, merged_results)
+ check_unfinished(merged_results)
+ n1 = check_unhandled_errors(args, log_path)
n2 = check_unexpected_logout(args)
if n1 + n2 > args.tolerance:
sys.exit(1)
diff --git a/scripts/generate_test_data.py b/scripts/generate_test_data.py
index 9abeadf..1c75908 100644
--- a/scripts/generate_test_data.py
+++ b/scripts/generate_test_data.py
@@ -8,7 +8,9 @@
def main() -> None:
DATASET = os.environ["DATASET"]
+
if DATASET == "webarena":
+ WEBAREANA_VERSION = os.getenv("WEBARENA_VERSION", "v1")
print("DATASET: webarena")
print(f"REDDIT: {REDDIT}")
print(f"SHOPPING: {SHOPPING}")
@@ -16,8 +18,14 @@ def main() -> None:
print(f"GITLAB: {GITLAB}")
print(f"WIKIPEDIA: {WIKIPEDIA}")
print(f"MAP: {MAP}")
-
- inp_paths = ["config_files/wa/test_webarena.raw.json"]
+ print(f"WEBAREANA_VERSION: {WEBAREANA_VERSION}")
+
+ if WEBAREANA_VERSION == "v1":
+ inp_paths = ["config_files/wa/test_webarena.raw.json"]
+ elif WEBAREANA_VERSION == "v2":
+ inp_paths = ["config_files/wa/test_webarena_v2.raw.json"]
+ else:
+ raise ValueError(f"Unknow WebArena version: {WEBAREANA_VERSION}")
replace_map = {
"__REDDIT__": REDDIT,
"__SHOPPING__": SHOPPING,
diff --git a/scripts/openai_request_parallel.py b/scripts/openai_request_parallel.py
new file mode 100644
index 0000000..05076b1
--- /dev/null
+++ b/scripts/openai_request_parallel.py
@@ -0,0 +1,508 @@
+"""
+API REQUEST PARALLEL PROCESSOR
+
+Using the OpenAI API to process lots of text quickly takes some care.
+If you trickle in a million API requests one by one, they'll take days to complete.
+If you flood a million API requests in parallel, they'll exceed the rate limits and fail with errors.
+To maximize throughput, parallel requests need to be throttled to stay under rate limits.
+
+This script parallelizes requests to the OpenAI API while throttling to stay under rate limits.
+
+Features:
+- Streams requests from file, to avoid running out of memory for giant jobs
+- Makes requests concurrently, to maximize throughput
+- Throttles request and token usage, to stay under rate limits
+- Retries failed requests up to {max_attempts} times, to avoid missing data
+- Logs errors, to diagnose problems with requests
+
+Example command to call script:
+```
+python scripts/openai_request_parallel.py \
+ --requests_filepath examples/data/example_requests_to_parallel_process.jsonl \
+ --save_filepath examples/data/example_requests_to_parallel_process_results.jsonl \
+ --request_url https://api.openai.com/v1/embeddings \
+ --max_requests_per_minute 1500 \
+ --max_tokens_per_minute 6250000 \
+ --token_encoding_name cl100k_base \
+ --max_attempts 5 \
+ --logging_level 20
+```
+
+Inputs:
+- requests_filepath : str
+ - path to the file containing the requests to be processed
+ - file should be a jsonl file, where each line is a json object with API parameters and an optional metadata field
+ - e.g., {"model": "text-embedding-ada-002", "input": "embed me", "metadata": {"row_id": 1}}
+ - as with all jsonl files, take care that newlines in the content are properly escaped (json.dumps does this automatically)
+ - an example file is provided at examples/data/example_requests_to_parallel_process.jsonl
+ - the code to generate the example file is appended to the bottom of this script
+- save_filepath : str, optional
+ - path to the file where the results will be saved
+ - file will be a jsonl file, where each line is an array with the original request plus the API response
+ - e.g., [{"model": "text-embedding-ada-002", "input": "embed me"}, {...}]
+ - if omitted, results will be saved to {requests_filename}_results.jsonl
+- request_url : str, optional
+ - URL of the API endpoint to call
+ - if omitted, will default to "https://api.openai.com/v1/chat/completions"
+- api_key : str, optional
+ - API key to use
+ - if omitted, the script will attempt to read it from the OPENAI_API_KEY environment variable
+- max_requests_per_minute : float, optional
+ - target number of requests to make per minute (will make fewer if limited by tokens)
+ - leave headroom by setting this to 50% or 75% of your limit
+ - if requests are limiting you, try batching multiple embeddings or completions into one request
+ - if omitted, will default to 1,500
+- max_tokens_per_minute : float, optional
+ - target number of tokens to use per minute (will use less if limited by requests)
+ - leave headroom by setting this to 50% or 75% of your limit
+ - if omitted, will default to 125,000
+- token_encoding_name : str, optional
+ - name of the token encoding used, as defined in the `tiktoken` package
+ - if omitted, will default to "cl100k_base" (used by `text-embedding-ada-002`)
+- max_attempts : int, optional
+ - number of times to retry a failed request before giving up
+ - if omitted, will default to 5
+- logging_level : int, optional
+ - level of logging to use; higher numbers will log fewer messages
+ - 40 = ERROR; will log only when requests fail after all retries
+ - 30 = WARNING; will log when requests hit rate limits or other errors
+ - 20 = INFO; will log when requests start and the status at finish
+ - 10 = DEBUG; will log various things as the loop runs to see when they occur
+ - if omitted, will default to 20 (INFO).
+
+The script is structured as follows:
+ - Imports
+ - Define main()
+ - Initialize things
+ - In main loop:
+ - Get next request if one is not already waiting for capacity
+ - Update available token & request capacity
+ - If enough capacity available, call API
+ - The loop pauses if a rate limit error is hit
+ - The loop breaks when no tasks remain
+ - Define dataclasses
+ - StatusTracker (stores script metadata counters; only one instance is created)
+ - APIRequest (stores API inputs, outputs, metadata; one method to call API)
+ - Define functions
+ - api_endpoint_from_url (extracts API endpoint from request URL)
+ - append_to_jsonl (writes to results file)
+ - num_tokens_consumed_from_request (bigger function to infer token usage from request)
+ - task_id_generator_function (yields 0, 1, 2, ...)
+ - Run main()
+"""
+
+import argparse # for running script from command line
+import asyncio # for running API calls concurrently
+import json # for saving results to a jsonl file
+import logging # for logging rate limit warnings and other messages
+import os # for reading API key
+import re # for matching endpoint from request URL
+import time # for sleeping after rate limit is hit
+from dataclasses import ( # for storing API inputs, outputs, and metadata
+ dataclass,
+ field,
+)
+
+# imports
+import aiohttp # for making API calls concurrently
+import tiktoken # for counting tokens
+
+
+async def process_api_requests_from_file(
+ requests_filepath: str,
+ save_filepath: str,
+ request_url: str,
+ api_key: str,
+ max_requests_per_minute: float,
+ max_tokens_per_minute: float,
+ token_encoding_name: str,
+ max_attempts: int,
+ logging_level: int,
+):
+ """Processes API requests in parallel, throttling to stay under rate limits."""
+ # constants
+ seconds_to_pause_after_rate_limit_error = 15
+ seconds_to_sleep_each_loop = (
+ 0.001 # 1 ms limits max throughput to 1,000 requests per second
+ )
+
+ # initialize logging
+ logging.basicConfig(level=logging_level)
+ logging.debug(f"Logging initialized at level {logging_level}")
+
+ # infer API endpoint and construct request header
+ api_endpoint = api_endpoint_from_url(request_url)
+ request_header = {"Authorization": f"Bearer {api_key}"}
+ # use api-key header for Azure deployments
+ if "/deployments" in request_url:
+ request_header = {"api-key": f"{api_key}"}
+
+ # initialize trackers
+ queue_of_requests_to_retry = asyncio.Queue()
+ task_id_generator = (
+ task_id_generator_function()
+ ) # generates integer IDs of 0, 1, 2, ...
+ status_tracker = (
+ StatusTracker()
+ ) # single instance to track a collection of variables
+ next_request = None # variable to hold the next request to call
+
+ # initialize available capacity counts
+ available_request_capacity = max_requests_per_minute
+ available_token_capacity = max_tokens_per_minute
+ last_update_time = time.time()
+
+ # initialize flags
+ file_not_finished = True # after file is empty, we'll skip reading it
+ logging.debug(f"Initialization complete.")
+
+ # initialize file reading
+ with open(requests_filepath) as file:
+ # `requests` will provide requests one at a time
+ requests = file.__iter__()
+ logging.debug(f"File opened. Entering main loop")
+ async with aiohttp.ClientSession() as session: # Initialize ClientSession here
+ while True:
+ # get next request (if one is not already waiting for capacity)
+ if next_request is None:
+ if not queue_of_requests_to_retry.empty():
+ next_request = queue_of_requests_to_retry.get_nowait()
+ logging.debug(
+ f"Retrying request {next_request.task_id}: {next_request}"
+ )
+ elif file_not_finished:
+ try:
+ # get new request
+ request_json = json.loads(next(requests))
+ next_request = APIRequest(
+ task_id=next(task_id_generator),
+ request_json=request_json,
+ token_consumption=num_tokens_consumed_from_request(
+ request_json,
+ api_endpoint,
+ token_encoding_name,
+ ),
+ attempts_left=max_attempts,
+ metadata=request_json.pop("metadata", None),
+ )
+ status_tracker.num_tasks_started += 1
+ status_tracker.num_tasks_in_progress += 1
+ logging.debug(
+ f"Reading request {next_request.task_id}: {next_request}"
+ )
+ except StopIteration:
+ # if file runs out, set flag to stop reading it
+ logging.debug("Read file exhausted")
+ file_not_finished = False
+
+ # update available capacity
+ current_time = time.time()
+ seconds_since_update = current_time - last_update_time
+ available_request_capacity = min(
+ available_request_capacity
+ + max_requests_per_minute * seconds_since_update / 60.0,
+ max_requests_per_minute,
+ )
+ available_token_capacity = min(
+ available_token_capacity
+ + max_tokens_per_minute * seconds_since_update / 60.0,
+ max_tokens_per_minute,
+ )
+ last_update_time = current_time
+
+ # if enough capacity available, call API
+ if next_request:
+ next_request_tokens = next_request.token_consumption
+ if (
+ available_request_capacity >= 1
+ and available_token_capacity >= next_request_tokens
+ ):
+ # update counters
+ available_request_capacity -= 1
+ available_token_capacity -= next_request_tokens
+ next_request.attempts_left -= 1
+
+ # call API
+ asyncio.create_task(
+ next_request.call_api(
+ session=session,
+ request_url=request_url,
+ request_header=request_header,
+ retry_queue=queue_of_requests_to_retry,
+ save_filepath=save_filepath,
+ status_tracker=status_tracker,
+ )
+ )
+ next_request = None # reset next_request to empty
+
+ # if all tasks are finished, break
+ if status_tracker.num_tasks_in_progress == 0:
+ break
+
+ # main loop sleeps briefly so concurrent tasks can run
+ await asyncio.sleep(seconds_to_sleep_each_loop)
+
+ # if a rate limit error was hit recently, pause to cool down
+ seconds_since_rate_limit_error = (
+ time.time() - status_tracker.time_of_last_rate_limit_error
+ )
+ if (
+ seconds_since_rate_limit_error
+ < seconds_to_pause_after_rate_limit_error
+ ):
+ remaining_seconds_to_pause = (
+ seconds_to_pause_after_rate_limit_error
+ - seconds_since_rate_limit_error
+ )
+ await asyncio.sleep(remaining_seconds_to_pause)
+ # ^e.g., if pause is 15 seconds and the last rate limit error was hit 5 seconds ago, sleep the remaining 10 seconds
+ logging.warn(
+ f"Pausing to cool down until {time.ctime(status_tracker.time_of_last_rate_limit_error + seconds_to_pause_after_rate_limit_error)}"
+ )
+
+ # after finishing, log final status
+ logging.info(
+ f"""Parallel processing complete. Results saved to {save_filepath}"""
+ )
+ if status_tracker.num_tasks_failed > 0:
+ logging.warning(
+ f"{status_tracker.num_tasks_failed} / {status_tracker.num_tasks_started} requests failed. Errors logged to {save_filepath}."
+ )
+ if status_tracker.num_rate_limit_errors > 0:
+ logging.warning(
+ f"{status_tracker.num_rate_limit_errors} rate limit errors received. Consider running at a lower rate."
+ )
+
+
+# dataclasses
+
+
+@dataclass
+class StatusTracker:
+ """Stores metadata about the script's progress. Only one instance is created."""
+
+ num_tasks_started: int = 0
+ num_tasks_in_progress: int = 0 # script ends when this reaches 0
+ num_tasks_succeeded: int = 0
+ num_tasks_failed: int = 0
+ num_rate_limit_errors: int = 0
+ num_api_errors: int = 0 # excluding rate limit errors, counted above
+ num_other_errors: int = 0
+ time_of_last_rate_limit_error: int = (
+ 0 # used to cool off after hitting rate limits
+ )
+
+
+@dataclass
+class APIRequest:
+ """Stores an API request's inputs, outputs, and other metadata. Contains a method to make an API call."""
+
+ task_id: int
+ request_json: dict
+ token_consumption: int
+ attempts_left: int
+ metadata: dict
+ result: list = field(default_factory=list)
+
+ async def call_api(
+ self,
+ session: aiohttp.ClientSession,
+ request_url: str,
+ request_header: dict,
+ retry_queue: asyncio.Queue,
+ save_filepath: str,
+ status_tracker: StatusTracker,
+ ):
+ """Calls the OpenAI API and saves results."""
+ logging.info(f"Starting request #{self.task_id}")
+ error = None
+ try:
+ async with session.post(
+ url=request_url, headers=request_header, json=self.request_json
+ ) as response:
+ response = await response.json()
+ if "error" in response:
+ logging.warning(
+ f"Request {self.task_id} failed with error {response['error']}"
+ )
+ status_tracker.num_api_errors += 1
+ error = response
+ if "Rate limit" in response["error"].get("message", ""):
+ status_tracker.time_of_last_rate_limit_error = time.time()
+ status_tracker.num_rate_limit_errors += 1
+ status_tracker.num_api_errors -= (
+ 1 # rate limit errors are counted separately
+ )
+
+ except (
+ Exception
+ ) as e: # catching naked exceptions is bad practice, but in this case we'll log & save them
+ logging.warning(
+ f"Request {self.task_id} failed with Exception {e}"
+ )
+ status_tracker.num_other_errors += 1
+ error = e
+ if error:
+ self.result.append(error)
+ if self.attempts_left:
+ retry_queue.put_nowait(self)
+ else:
+ logging.error(
+ f"Request {self.request_json} failed after all attempts. Saving errors: {self.result}"
+ )
+ data = (
+ [
+ self.request_json,
+ [str(e) for e in self.result],
+ self.metadata,
+ ]
+ if self.metadata
+ else [self.request_json, [str(e) for e in self.result]]
+ )
+ append_to_jsonl(data, save_filepath)
+ status_tracker.num_tasks_in_progress -= 1
+ status_tracker.num_tasks_failed += 1
+ else:
+ data = (
+ [self.request_json, response, self.metadata]
+ if self.metadata
+ else [self.request_json, response]
+ )
+ append_to_jsonl(data, save_filepath)
+ status_tracker.num_tasks_in_progress -= 1
+ status_tracker.num_tasks_succeeded += 1
+ logging.debug(f"Request {self.task_id} saved to {save_filepath}")
+
+
+# functions
+
+
+def api_endpoint_from_url(request_url):
+ """Extract the API endpoint from the request URL."""
+ match = re.search("^https://[^/]+/v\\d+/(.+)$", request_url)
+ if match is None:
+ # for Azure OpenAI deployment urls
+ match = re.search(
+ r"^https://[^/]+/openai/deployments/[^/]+/(.+?)(\?|$)", request_url
+ )
+ return match[1]
+
+
+def append_to_jsonl(data, filename: str) -> None:
+ """Append a json payload to the end of a jsonl file."""
+ json_string = json.dumps(data)
+ with open(filename, "a") as f:
+ f.write(json_string + "\n")
+
+
+def num_tokens_consumed_from_request(
+ request_json: dict,
+ api_endpoint: str,
+ token_encoding_name: str,
+):
+ """Count the number of tokens in the request. Only supports completion and embedding requests."""
+ encoding = tiktoken.get_encoding(token_encoding_name)
+ # if completions request, tokens = prompt + n * max_tokens
+ if api_endpoint.endswith("completions"):
+ max_tokens = request_json.get("max_tokens", 15)
+ n = request_json.get("n", 1)
+ completion_tokens = n * max_tokens
+
+ # chat completions
+ if api_endpoint.startswith("chat/"):
+ num_tokens = 0
+ for message in request_json["messages"]:
+ num_tokens += 4 # every message follows {role/name}\n{content}\n
+ for key, value in message.items():
+ num_tokens += len(encoding.encode(value))
+ if key == "name": # if there's a name, the role is omitted
+ num_tokens -= (
+ 1 # role is always required and always 1 token
+ )
+ num_tokens += 2 # every reply is primed with assistant
+ return num_tokens + completion_tokens
+ # normal completions
+ else:
+ prompt = request_json["prompt"]
+ if isinstance(prompt, str): # single prompt
+ prompt_tokens = len(encoding.encode(prompt))
+ num_tokens = prompt_tokens + completion_tokens
+ return num_tokens
+ elif isinstance(prompt, list): # multiple prompts
+ prompt_tokens = sum([len(encoding.encode(p)) for p in prompt])
+ num_tokens = prompt_tokens + completion_tokens * len(prompt)
+ return num_tokens
+ else:
+ raise TypeError(
+ 'Expecting either string or list of strings for "prompt" field in completion request'
+ )
+ # if embeddings request, tokens = input tokens
+ elif api_endpoint == "embeddings":
+ input = request_json["input"]
+ if isinstance(input, str): # single input
+ num_tokens = len(encoding.encode(input))
+ return num_tokens
+ elif isinstance(input, list): # multiple inputs
+ num_tokens = sum([len(encoding.encode(i)) for i in input])
+ return num_tokens
+ else:
+ raise TypeError(
+ 'Expecting either string or list of strings for "inputs" field in embedding request'
+ )
+ # more logic needed to support other API calls (e.g., edits, inserts, DALL-E)
+ else:
+ raise NotImplementedError(
+ f'API endpoint "{api_endpoint}" not implemented in this script'
+ )
+
+
+def task_id_generator_function():
+ """Generate integers 0, 1, 2, and so on."""
+ task_id = 0
+ while True:
+ yield task_id
+ task_id += 1
+
+
+# run script
+
+
+if __name__ == "__main__":
+ # parse command line arguments
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--requests_filepath")
+ parser.add_argument("--save_filepath", default=None)
+ parser.add_argument(
+ "--request_url", default="https://api.openai.com/v1/chat/completions"
+ )
+ parser.add_argument("--api_key", default=os.getenv("OPENAI_API_KEY"))
+ parser.add_argument(
+ "--max_requests_per_minute", type=int, default=3_000 * 0.5
+ )
+ parser.add_argument(
+ "--max_tokens_per_minute", type=int, default=250_000 * 0.5
+ )
+ parser.add_argument("--token_encoding_name", default="cl100k_base")
+ parser.add_argument("--max_attempts", type=int, default=5)
+ parser.add_argument("--logging_level", default=logging.INFO)
+ args = parser.parse_args()
+
+ if args.save_filepath is None:
+ args.save_filepath = args.requests_filepath.replace(
+ ".jsonl", "_results.jsonl"
+ )
+
+ # run script
+ asyncio.run(
+ process_api_requests_from_file(
+ requests_filepath=args.requests_filepath,
+ save_filepath=args.save_filepath,
+ request_url=args.request_url,
+ api_key=args.api_key,
+ max_requests_per_minute=float(args.max_requests_per_minute),
+ max_tokens_per_minute=float(args.max_tokens_per_minute),
+ token_encoding_name=args.token_encoding_name,
+ max_attempts=int(args.max_attempts),
+ logging_level=int(args.logging_level),
+ )
+ )
diff --git a/scripts/utils.py b/scripts/utils.py
new file mode 100644
index 0000000..8bf22a2
--- /dev/null
+++ b/scripts/utils.py
@@ -0,0 +1,49 @@
+import os
+import json
+def calc_openai_cost(response_file: str) -> float:
+ in_tok_tot = 0
+ out_tok_tot = 0
+
+ errors = 0
+ if not os.path.exists(response_file):
+ print(f"{response_file} does not exist")
+ return 0.0
+
+ with open(response_file, "r") as f:
+ model_checked = False
+ for l_idx, line in enumerate(f):
+ cur_request = json.loads(line)
+ if not model_checked:
+ model = cur_request[0]["model"]
+ match model:
+ case "gpt-3.5-turbo" | "gpt-3.5-turbo-1106":
+ in_cost = 0.001
+ out_cost = 0.002
+ case (
+ "gpt-4-1106-preview" | "vijay-gpt-4" | "gpt-4-turbo-2024-04-09"
+ ):
+ in_cost = 0.01
+ out_cost = 0.03
+ case "gpt-4":
+ in_cost = 0.03
+ out_cost = 0.06
+ case "gpt-4o":
+ in_cost = 0.005
+ out_cost = 0.015
+ case _:
+ raise ValueError(f"Unknown model: {model}")
+ model_checked = True
+ try:
+ in_tok_tot += cur_request[1]["usage"]["prompt_tokens"]
+ out_tok_tot += cur_request[1]["usage"]["completion_tokens"]
+ except TypeError:
+ errors += 1
+
+ # calc the cost
+ cost = in_tok_tot / 1000 * in_cost + out_tok_tot / 1000 * out_cost
+ print(f"Input cost: {in_cost}, Output cost: {out_cost}")
+ print(
+ f"Input cost: {in_tok_tot / 1000 * in_cost}, Output cost: {out_tok_tot / 1000 * out_cost}"
+ )
+ print(f"Errors: {errors}")
+ return cost
\ No newline at end of file