From c011d3cd581c244f04cf55308745c3b2bb8ec67b Mon Sep 17 00:00:00 2001 From: tomaioo Date: Sun, 21 Jun 2026 23:27:09 -0700 Subject: [PATCH] fix(ui): poor error handling UX in async dispatcher In `async_utils.py`, the `wait_all` function cancels pending tasks on timeout but raises a bare `asyncio.TimeoutError` with no message, so the error gives no indication of which tasks timed out or how many were still pending. This makes distributed training failures hard to debug. Include the number of pending tasks, the total number of tasks, and the names of the pending tasks in the error message. Signed-off-by: tomaioo <203048277+tomaioo@users.noreply.github.com> --- skyrl-agent/skyrl_agent/dispatcher/async_utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/skyrl-agent/skyrl_agent/dispatcher/async_utils.py b/skyrl-agent/skyrl_agent/dispatcher/async_utils.py index 5c5856b27f..111a33d0d3 100644 --- a/skyrl-agent/skyrl_agent/dispatcher/async_utils.py +++ b/skyrl-agent/skyrl_agent/dispatcher/async_utils.py @@ -68,7 +68,11 @@ async def wait_all(iterable: Iterable[Coroutine], timeout: int = GENERAL_TIMEOUT if pending: for task in pending: task.cancel() - raise asyncio.TimeoutError() + pending_ids = [task.get_name() for task in pending] + raise asyncio.TimeoutError( + f"Timeout waiting for {len(pending)} pending task(s) out of {len(tasks)} total. " + f"Pending task IDs: {pending_ids}" + ) results = [] errors = [] for task in tasks: