AgentR1
diff --git a/‎README.md‎
Lines changed: 3 additions & 1 deletion b/‎README.md‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎claw_r1/async_rollouter.py‎
Lines changed: 17 additions & 2 deletions b/‎claw_r1/async_rollouter.py‎
Lines changed: 17 additions & 2 deletions
diff --git a/‎claw_r1/async_trainer.py‎
Lines changed: 1 addition & 0 deletions b/‎claw_r1/async_trainer.py‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎claw_r1/blackbox_agent/__init__.py‎ b/‎claw_r1/blackbox_agent/__init__.py‎
diff --git a/‎claw_r1/blackbox_agent/agent_flow_config.yaml‎
Lines changed: 2 additions & 0 deletions b/‎claw_r1/blackbox_agent/agent_flow_config.yaml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎claw_r1/blackbox_agent/blackbox_agent_flow.py‎
Lines changed: 96 additions & 0 deletions b/‎claw_r1/blackbox_agent/blackbox_agent_flow.py‎
Lines changed: 96 additions & 0 deletions
diff --git a/‎claw_r1/blackbox_agent/gsm8k_agent.py‎
Lines changed: 135 additions & 0 deletions b/‎claw_r1/blackbox_agent/gsm8k_agent.py‎
Lines changed: 135 additions & 0 deletions
diff --git a/‎claw_r1/blackbox_agent/gsm8k_agent_flow.py‎
Lines changed: 36 additions & 0 deletions b/‎claw_r1/blackbox_agent/gsm8k_agent_flow.py‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎claw_r1/data_pool/training_backend.py‎
Lines changed: 6 additions & 2 deletions b/‎claw_r1/data_pool/training_backend.py‎
Lines changed: 6 additions & 2 deletions
@@ -11,7 +11,9 @@
 
 ## News
 
-- **[2026.03]** 🚧 **Claw-R1 Project Init.** We are actively updating the framework. Stay tuned for more features and documentation.
+- **[2026.03.06]** 📖 **Claw-R1 Documentation Released.** Project page and documentation are now available at [Claw-R1 Project Page](https://agentr1.github.io/) and [Claw-R1 docs](https://agentr1.github.io/Claw-R1/).
+
+- **[2026.03.03]** 🚧 **Claw-R1 Project Init.** We are actively updating the framework. Stay tuned for more features and documentation.
 
 ## Overview
 
 
@@ -264,11 +264,22 @@ def _init_gateway(self):
             str(gateway_port),
         ]
 
-        self._gateway_process = subprocess.Popen(cmd)
+        self._gateway_process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.PIPE,
+            text=True,
+        )
         self._gateway_url = f"http://localhost:{gateway_port}"
         atexit.register(self._stop_gateway)
 
         for _ in range(120):
+            if self._gateway_process.poll() is not None:
+                _, err = self._gateway_process.communicate()
+                err = (err or "").strip() or "(no stderr)"
+                raise RuntimeError(
+                    f"Gateway process exited before ready ({self._gateway_url}). stderr:\n{err}"
+                )
             try:
                 resp = httpx.get(f"{self._gateway_url}/docs", timeout=2.0)
                 if resp.status_code == 200:
@@ -277,7 +288,11 @@ def _init_gateway(self):
             except Exception:
                 pass
             time.sleep(1)
-        raise RuntimeError(f"Gateway did not start within 120s ({self._gateway_url})")
+        raise RuntimeError(
+            f"Gateway did not start within 120s ({self._gateway_url}). "
+            "Check that port %s is free and no firewall blocks it."
+            % gateway_port
+        )
 
     def _stop_gateway(self):
         proc = getattr(self, "_gateway_process", None)
 
@@ -270,6 +270,7 @@ def fit(self):
     def _process_batch(self, batch: DataProto, metrics: dict, timing_raw: dict) -> DataProto:
         """Run the full PPO pipeline on a single batch."""
         batch.meta_info["global_token_num"] = batch.batch["attention_mask"].sum(dim=-1).tolist()
+        batch.meta_info.setdefault("temperature", self.config.actor_rollout_ref.rollout.temperature)
 
         if "response_mask" not in batch.batch:
             batch.batch["response_mask"] = compute_response_mask(batch)
 
@@ -0,0 +1,2 @@
+- name: blackbox_gsm8k_agent
+  _target_: claw_r1.blackbox_agent.gsm8k_agent_flow.BlackBoxGSM8KAgentFlow
@@ -0,0 +1,96 @@
+"""Black-box agent flow — base class.
+
+BlackBoxAgentFlowBase handles the full protocol with Gateway (init_trajectory,
+register_trajectory, complete) and delegates agent execution to subclasses via
+_run_agent.  Subclasses only create and run the concrete Agent; they do not
+touch Gateway or implement any task logic.  Concrete strategies live in
+separate modules (e.g. gsm8k_agent_flow.py).
+"""
+
+import json
+import logging
+import os
+from abc import abstractmethod
+from typing import Any
+
+import httpx
+import numpy as np
+
+from claw_r1.agent_flow.agent_flow import AgentFlowBase, register
+
+logger = logging.getLogger(__name__)
+logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
+
+_DEFAULT_SKIP_KEYS = frozenset({"raw_prompt", "multi_modal_data", "channel", "agent_name"})
+
+
+class _NumpyEncoder(json.JSONEncoder):
+    """JSON encoder that converts numpy values to native Python types for HTTP requests.
+
+    The standard encoder rejects numpy scalars and arrays; this subclass maps
+    them to their built-in equivalents before serialization.
+    """
+
+    def default(self, o):
+        """Return a JSON-serializable equivalent of *o*.
+
+        numpy booleans, integers and floats become ``bool``/``int``/``float``;
+        arrays become (possibly nested) lists.  Any other object is delegated
+        to the base class, which raises ``TypeError``.
+        """
+        # np.bool_ is not a subclass of np.integer, so it needs its own branch.
+        if isinstance(o, np.bool_):
+            return bool(o)
+        if isinstance(o, np.integer):
+            return int(o)
+        if isinstance(o, np.floating):
+            return float(o)
+        if isinstance(o, np.ndarray):
+            return o.tolist()
+        return super().default(o)
+
+
+class BlackBoxAgentFlowBase(AgentFlowBase):
+    """Base class for black-box agent flows.
+
+    Handles generic parameter processing and the full Gateway protocol:
+    init_trajectory (get base_url) -> register_trajectory (channel + metadata)
+    -> call subclass _run_agent -> complete_trajectory.  Subclasses only
+    implement _run_agent to create and run the concrete Agent.
+    """
+
+    def _prepare_params(self, kwargs: dict[str, Any]) -> tuple[str | None, str, dict[str, Any]]:
+        """Extract channel, prompt_uid, and metadata from kwargs.
+
+        ``channel`` is popped from *kwargs*, so it is not forwarded to
+        ``_run_agent``.  ``uid`` is read but left in place and defaults to
+        ``"1"``.  Metadata is every remaining entry whose key is not in
+        ``_DEFAULT_SKIP_KEYS``.
+        """
+        channel = kwargs.pop("channel", None)
+        prompt_uid = str(kwargs.get("uid", "1"))
+        metadata = {k: v for k, v in kwargs.items() if k not in _DEFAULT_SKIP_KEYS}
+        return channel, prompt_uid, metadata
+
+    async def run(self, sampling_params: dict[str, Any], **kwargs) -> int:
+        """Drive one trajectory through the Gateway and return the turn count.
+
+        Args:
+            sampling_params: Not used by this method.
+            **kwargs: Per-sample fields; see ``_prepare_params``.
+
+        Returns:
+            The value returned by ``_run_agent``.
+
+        Raises:
+            httpx.HTTPStatusError: If ``init_trajectory`` or
+                ``register_trajectory`` answers with an error status.
+        """
+        channel, prompt_uid, metadata = self._prepare_params(kwargs)
+
+        async with httpx.AsyncClient(timeout=30.0) as http:
+            # 1. Allocate trajectory — get base_url with trajectory_uid embedded.
+            init_resp = await http.post(f"{self.gateway_url}/init_trajectory")
+            init_resp.raise_for_status()
+            base_url_from_init = init_resp.json()["base_url"]
+            # base_url_from_init is http://host:port/{traj_uid}/{default_prompt_uid}/v1
+            # Replace the default prompt_uid with the actual one.  A trailing
+            # slash is stripped first so the split always removes exactly the
+            # last two path segments.
+            parts = base_url_from_init.rstrip("/").rsplit("/", 2)  # [...base, prompt_uid, "v1"]
+            base_url = f"{parts[0]}/{prompt_uid}/v1"
+
+            # 2. Register channel + metadata via base_url.
+            reg_body: dict[str, Any] = {}
+            if channel:
+                reg_body["channel"] = channel
+            if metadata:
+                reg_body["metadata"] = metadata
+            payload = json.dumps(reg_body, cls=_NumpyEncoder).encode()
+            reg_resp = await http.post(
+                f"{base_url}/register_trajectory",
+                content=payload,
+                headers={"content-type": "application/json"},
+            )
+            # Fail loudly, like step 1: running the agent on a trajectory
+            # whose channel/metadata was rejected would go unnoticed otherwise.
+            reg_resp.raise_for_status()
+
+        # 3. Run the concrete agent.
+        try:
+            num_turns = await self._run_agent(base_url, kwargs)
+        finally:
+            # 4. Mark trajectory complete, even when the agent raised.
+            async with httpx.AsyncClient(timeout=httpx.Timeout(600.0)) as http:
+                done_resp = await http.post(f"{base_url}/complete_trajectory")
+            if done_resp.is_error:
+                # Logged rather than raised so that an exception coming from
+                # the agent is not replaced by this one.
+                logger.warning(
+                    "complete_trajectory failed for %s: HTTP %s",
+                    base_url,
+                    done_resp.status_code,
+                )
+
+        return num_turns
+
+    @abstractmethod
+    async def _run_agent(self, base_url: str, kwargs: dict[str, Any]) -> int:
+        """Create and run the concrete Agent.  Subclasses implement this.
+
+        Args:
+            base_url: Trajectory-scoped URL ending in ``/{prompt_uid}/v1``.
+            kwargs: The per-sample fields passed to ``run`` (without ``channel``).
+
+        Returns:
+            The number of turns the agent used.
+        """
+        raise NotImplementedError
@@ -0,0 +1,135 @@
+"""GSM8K black-box agent — fully independent of training internals.
+
+This agent uses a standard OpenAI-compatible API to interact with the LLM,
+parses tool calls from raw text output (Qwen-style ``<tool_call>`` tags),
+and executes a local ``check_answer`` tool.
+
+It knows nothing about trajectory UIDs, Steps, DataPool, or reward — all of
+those are transparently handled by the Gateway.
+"""
+
+import json
+import logging
+
+import regex
+
+logger = logging.getLogger(__name__)
+
+CHECK_ANSWER_TOOL = {
+    "type": "function",
+    "function": {
+        "name": "check_answer",
+        "description": "Check if your answer to the math problem is correct.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+                "answer": {
+                    "type": "string",
+                    "description": "Your final numerical answer",
+                }
+            },
+            "required": ["answer"],
+        },
+    },
+}
+
+TOOL_CALL_REGEX = regex.compile(r"<tool_call>(.*?)</tool_call>", regex.DOTALL)
+
+
+def parse_tool_calls(content: str) -> tuple[str, list[dict]]:
+    """Split raw LLM output into plain text and ``<tool_call>`` payloads.
+
+    Mirrors the parsing logic of verl's ``HermesToolParser``.
+
+    Each ``<tool_call>...</tool_call>`` body is decoded as JSON.  Bodies that
+    are not valid JSON, are not JSON objects, or lack a ``name`` or
+    ``arguments`` key are dropped without error.
+
+    Returns:
+        (remaining_text, list_of_tool_calls) where each tool call is a dict
+        with ``name`` and ``arguments`` keys.  When *content* has no
+        ``<tool_call>`` tag it is returned unchanged with an empty list;
+        otherwise the remaining text has all tool-call blocks removed and
+        surrounding whitespace stripped.
+    """
+    if "<tool_call>" not in content:
+        return content, []
+
+    calls: list[dict] = []
+    for found in TOOL_CALL_REGEX.finditer(content):
+        try:
+            payload = json.loads(found.group(1))
+            if isinstance(payload, dict):
+                calls.append({"name": payload["name"], "arguments": payload["arguments"]})
+        except (json.JSONDecodeError, KeyError, TypeError):
+            # Malformed blocks are skipped; the remaining ones are still parsed.
+            continue
+
+    leftover = TOOL_CALL_REGEX.sub("", content).strip()
+    return leftover, calls
+
+
+def check_answer(answer: str, ground_truth: str) -> str:
+    """Verify *answer* locally and describe the outcome in words.
+
+    The answer is wrapped as ``#### {answer}`` and scored with verl's GSM8K
+    ``compute_score`` (``method="flexible"``, 1.0 for a match, 0.0 otherwise).
+    Only a feedback sentence is returned, never the numeric score.
+    """
+    # Local import: verl is only loaded when an answer is actually checked.
+    from verl.utils.reward_score.gsm8k import compute_score
+
+    candidate = f"#### {answer}"
+    points = compute_score(candidate, ground_truth, method="flexible", format_score=0.0, score=1.0)
+    if points <= 0:
+        return "Incorrect. Your answer is wrong, please try again."
+    return "Correct! Your answer is right."
+
+
+class GSM8KAgent:
+    """Stateless GSM8K solving agent that talks to an OpenAI-compatible API.
+
+    The agent is completely unaware of training-side concepts such as
+    ``trajectory_uid``, ``Step``, or ``DataPool``.  All it needs is a
+    ``base_url`` pointing to an OpenAI-compatible endpoint.
+
+    Args:
+        base_url: Base URL handed to the OpenAI SDK client as-is (only a
+            trailing slash is stripped), so it must already contain any
+            version suffix the endpoint expects, e.g.
+            ``http://host:port/{traj}/{prompt}/v1``.
+    """
+
+    def __init__(self, base_url: str):
+        import openai
+
+        self.base_url = base_url.rstrip("/")
+        self.client = openai.AsyncOpenAI(
+            base_url=self.base_url,
+            api_key="not-needed",
+            timeout=600.0,
+        )
+
+    @staticmethod
+    def _extract_answer(arguments):
+        """Return the ``answer`` field of parsed tool-call arguments.
+
+        *arguments* comes from model-generated JSON, so its type is not
+        guaranteed.  A dict is read directly; a string is treated as a
+        JSON-encoded object; anything else (or a missing key) yields ``""``.
+        """
+        if isinstance(arguments, str):
+            try:
+                arguments = json.loads(arguments)
+            except json.JSONDecodeError:
+                return ""
+        if not isinstance(arguments, dict):
+            return ""
+        return arguments.get("answer", "")
+
+    async def solve(self, question: str, ground_truth: str, max_turns: int = 3) -> int:
+        """Attempt to solve *question* in up to *max_turns* LLM interactions.
+
+        Each turn sends the conversation to the model, parses ``<tool_call>``
+        blocks from the reply text, and answers every ``check_answer`` call
+        with textual feedback.  The loop stops at the first reply without a
+        tool call or after *max_turns* turns.
+
+        Returns the number of turns actually used.  Trajectory completion is
+        signaled by the caller (BlackBoxAgentFlowBase or online service entrypoint).
+        """
+        messages: list[dict] = [{"role": "user", "content": question}]
+
+        turns_used = 0
+        for turn in range(max_turns):
+            turns_used = turn + 1
+
+            resp = await self.client.chat.completions.create(
+                model="default",
+                messages=messages,
+                tools=[CHECK_ANSWER_TOOL],
+            )
+            content = resp.choices[0].message.content or ""
+            _, tool_calls = parse_tool_calls(content)
+
+            # The raw assistant text (tool-call tags included) is always recorded.
+            messages.append({"role": "assistant", "content": content})
+            if not tool_calls:
+                break
+
+            for tc in tool_calls:
+                if tc["name"] == "check_answer":
+                    # Arguments may be a dict, a JSON string, or something else
+                    # entirely; never let a malformed call abort the rollout.
+                    answer = self._extract_answer(tc["arguments"])
+                    result = check_answer(answer, ground_truth)
+                    messages.append({"role": "tool", "content": result})
+
+        return turns_used
@@ -0,0 +1,36 @@
+"""GSM8K black-box agent flow — concrete strategy for GSM8K."""
+
+from typing import Any
+
+from claw_r1.agent_flow.agent_flow import register
+
+from claw_r1.blackbox_agent.blackbox_agent_flow import BlackBoxAgentFlowBase
+
+from claw_r1.blackbox_agent.gsm8k_agent import GSM8KAgent
+
+
+@register("blackbox_gsm8k_agent")
+class BlackBoxGSM8KAgentFlow(BlackBoxAgentFlowBase):
+    """Black-box flow that delegates to :class:`GSM8KAgent`."""
+
+    async def _run_agent(self, base_url: str, kwargs: dict[str, Any]) -> int:
+        """Build a :class:`GSM8KAgent` for *base_url* and let it solve the sample.
+
+        The question is the content of the last ``user`` message in
+        ``kwargs["raw_prompt"]`` when that is a non-empty list, the prompt
+        itself when it is a string, and ``str(raw_prompt)`` in every other
+        case (including an empty or missing user message).  The ground truth
+        is read from ``kwargs["reward_model"]`` as a dict key or attribute.
+
+        Returns:
+            The number of turns reported by ``GSM8KAgent.solve``.
+        """
+        prompt = kwargs.get("raw_prompt", [])
+
+        # Start from the generic fallback and refine it when the prompt has
+        # a recognizable shape.
+        question = str(prompt)
+        if isinstance(prompt, str):
+            question = prompt
+        elif isinstance(prompt, list) and prompt:
+            for message in reversed(prompt):
+                if message.get("role") == "user":
+                    question = message.get("content", "") or str(prompt)
+                    break
+
+        reward_spec = kwargs.get("reward_model", {})
+        if isinstance(reward_spec, dict):
+            raw_truth = reward_spec.get("ground_truth", "")
+        else:
+            raw_truth = getattr(reward_spec, "ground_truth", "")
+        ground_truth = str(raw_truth)
+
+        turn_limit = self.config.actor_rollout_ref.rollout.get("max_turns", 3)
+        solver = GSM8KAgent(base_url=base_url)
+        return await solver.solve(question=question, ground_truth=ground_truth, max_turns=turn_limit)
@@ -135,9 +135,12 @@ def _pad_single_step(self, step: Step) -> dict[str, Any]:
 
         Returns a dict of tensors, each with a leading batch dim of 1.
         """
+        pad_token_id = self._tokenizer.pad_token_id or 0
+
         self._tokenizer.padding_side = "left"
+        prompt_ids = step.prompt_ids if step.prompt_ids else [pad_token_id]
         prompt_out = self._tokenizer.pad(
-            {"input_ids": step.prompt_ids},
+            {"input_ids": prompt_ids},
             padding="max_length",
             max_length=self._prompt_length,
             return_tensors="pt",
@@ -148,8 +151,9 @@ def _pad_single_step(self, step: Step) -> dict[str, Any]:
             prompt_out["attention_mask"] = prompt_out["attention_mask"].unsqueeze(0)
 
         self._tokenizer.padding_side = "right"
+        response_ids = step.response_ids if step.response_ids else [pad_token_id]
         response_out = self._tokenizer.pad(
-            {"input_ids": step.response_ids},
+            {"input_ids": response_ids},
             padding="max_length",
             max_length=self._response_length,
             return_tensors="pt",
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+- name: blackbox_gsm8k_agent`
	`2`	`+ _target_: claw_r1.blackbox_agent.gsm8k_agent_flow.BlackBoxGSM8KAgentFlow`