From 08955705e40237c946ff70f76d3f96438ad2948a Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Sun, 24 May 2026 11:02:05 -0600 Subject: [PATCH] feat(0.19.0): driven-loop kernel (sandbox-SDK-based) + coderProfile MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 0 of the driven-loop substrate. Ships: - `@tangle-network/agent-runtime/loops` — `runLoop` kernel + Refine and FanoutVote drivers, built on the sandbox SDK's `AgentProfile` + `streamPrompt` contract. The kernel orchestrates around the sandbox SDK; it does not invent its own notion of "what an agent is". - `@tangle-network/agent-runtime/profiles` — `coderProfile` + `multiHarnessCoderFanout`. Bundle an `AgentProfile`, task-to-prompt formatter, output adapter, and per-task validator (forbidden paths, diff cap, tests + typecheck) into a runLoop-ready unit. Layering: sandbox SDK AgentProfile + Sandbox + streamPrompt agent-runtime/loops runLoop kernel + drivers agent-runtime/profiles presets (coder; researcher in Phase 1) agent-runtime existing UNTOUCHED — runAgentTask, RuntimeRunHandle etc Kernel responsibilities: iteration accounting, parallel execution bounded by `maxConcurrency`, abort propagation, cost aggregation from sandbox `llm_call`-shaped events (with optional `runHandle.observe` forwarding), and trace emission via `LoopTraceEmitter`. Driver responsibilities: topology only. Refine returns `[task]` until the validator passes; FanoutVote returns N copies on iteration 0 then selects the highest-scoring valid output. Drivers receive a read-only history and a typed decision channel; the kernel terminates on `'stop' | 'pick-winner' | 'fail' | 'done'`. Output adapter parses an event array → typed Output. Validator scores the typed Output → DefaultVerdict. Both are pure functions; tests exercise them without a real sandbox. Heterogeneous fanout is built in: pass `agentRuns: AgentRunSpec[]` and the kernel round-robins through them when the driver plans N tasks. 
`multiHarnessCoderFanout` ships a 3-harness default (claude-code, codex, opencode/zai-coding-plan/glm-5.1). Tests (25 new, all 154 pass): - tests/loops/refine.test.ts (7) — refine-until-valid, maxIter cap, error capture, trace event ordering, cost aggregation - tests/loops/fanout-vote.test.ts (6) — winner selection, fail mode, `maxConcurrency` enforcement, heterogeneous agentRuns, error handling on missing options - tests/loops/composition.test.ts (2) — recursive runLoop in Driver.plan; static typecheck of nested kernel calls - tests/profiles/coder.test.ts (10) — task-bound validator (forbidden-path, diff cap, tests, typecheck), score math, output adapter (structured result + fenced-JSON fallback), multi-harness fanout shape Build, typecheck, lint clean. Existing 129 tests untouched. Smoke test (manual; requires sandbox credentials): cd /home/drew/code/agent-runtime && pnpm build TANGLE_SANDBOX_API_KEY=... TANGLE_ORCHESTRATOR_URL=... node -e " import { Sandbox } from '@tangle-network/sandbox' import { runLoop, createFanoutVoteDriver } from './dist/loops.js' import { multiHarnessCoderFanout } from './dist/profiles.js' const client = new Sandbox({ apiKey: process.env.TANGLE_SANDBOX_API_KEY, baseUrl: process.env.TANGLE_ORCHESTRATOR_URL }) const { agentRuns, output, validator, driver } = multiHarnessCoderFanout() const result = await runLoop({ driver, agentRuns, output, validator, task: { goal: 'add a hello function', repoRoot: '/work/repo' }, ctx: { sandboxClient: client }, }) console.log(result.decision, result.winner?.iterationIndex, result.costUsd) " Out of scope (Phase 1+): researcherProfile, sandboxedDriver helper, MCP wrapper, Council/Decompose/Pipeline topologies, agent-eval refactor. 
--- package.json | 12 +- src/loops/drivers/fanout-vote.ts | 102 ++++++ src/loops/drivers/refine.ts | 79 +++++ src/loops/index.ts | 49 +++ src/loops/run-loop.ts | 534 +++++++++++++++++++++++++++++++ src/loops/trace.ts | 22 ++ src/loops/types.ts | 235 ++++++++++++++ src/profiles/coder.ts | 398 +++++++++++++++++++++++ src/profiles/index.ts | 16 + tests/loops/composition.test.ts | 179 +++++++++++ tests/loops/fanout-vote.test.ts | 281 ++++++++++++++++ tests/loops/refine.test.ts | 283 ++++++++++++++++ tests/profiles/coder.test.ts | 186 +++++++++++ tsup.config.ts | 2 + 14 files changed, 2377 insertions(+), 1 deletion(-) create mode 100644 src/loops/drivers/fanout-vote.ts create mode 100644 src/loops/drivers/refine.ts create mode 100644 src/loops/index.ts create mode 100644 src/loops/run-loop.ts create mode 100644 src/loops/trace.ts create mode 100644 src/loops/types.ts create mode 100644 src/profiles/coder.ts create mode 100644 src/profiles/index.ts create mode 100644 tests/loops/composition.test.ts create mode 100644 tests/loops/fanout-vote.test.ts create mode 100644 tests/loops/refine.test.ts create mode 100644 tests/profiles/coder.test.ts diff --git a/package.json b/package.json index 28fbdde..1d44356 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-runtime", - "version": "0.18.0", + "version": "0.19.0", "description": "Reusable runtime lifecycle for domain-specific agents.", "homepage": "https://github.com/tangle-network/agent-runtime#readme", "repository": { @@ -33,6 +33,16 @@ "types": "./dist/agent.d.ts", "import": "./dist/agent.js", "default": "./dist/agent.js" + }, + "./loops": { + "types": "./dist/loops.d.ts", + "import": "./dist/loops.js", + "default": "./dist/loops.js" + }, + "./profiles": { + "types": "./dist/profiles.d.ts", + "import": "./dist/profiles.js", + "default": "./dist/profiles.js" } }, "files": [ diff --git a/src/loops/drivers/fanout-vote.ts b/src/loops/drivers/fanout-vote.ts new file mode 100644 index 
0000000..9139499 --- /dev/null +++ b/src/loops/drivers/fanout-vote.ts @@ -0,0 +1,102 @@ +/** + * @experimental + * + * FanoutVote driver — N parallel attempts in iteration 0, pick the highest- + * scoring valid output. No second iteration: the topology is "spawn N, score, + * pick winner". The kernel handles heterogeneous fanout via the + * `agentRuns: AgentRunSpec[]` form on `runLoop`. + */ + +import { ValidationError } from '../../errors' +import type { DefaultVerdict, Driver, Iteration } from '../types' + +export type FanoutVoteDecision = 'pick-winner' | 'fail' + +/** @experimental */ +export interface FanoutVoteScored { + task: Task + output: Output + verdict?: DefaultVerdict + iterationIndex: number + agentRunName: string +} + +/** @experimental */ +export interface CreateFanoutVoteDriverOptions { + /** Number of parallel attempts. Must be >= 1. */ + n: number + /** + * Pick the winner from the scored set. Default: highest `verdict.score` + * among valid outputs (ties broken by smallest iteration index). When + * no valid outputs exist, returns `undefined` and `decide()` resolves + * to `'fail'`. The kernel still records winners structurally — this + * selector only feeds `decide()`'s pass/fail signal. + */ + selector?: ( + scored: FanoutVoteScored[], + ) => FanoutVoteScored | undefined + /** Stable identifier surfaced in trace events. Default `'fanout-vote'`. */ + name?: string +} + +/** @experimental */ +export function createFanoutVoteDriver( + options: CreateFanoutVoteDriverOptions, +): Driver { + if (!Number.isFinite(options.n) || options.n < 1) { + throw new ValidationError(`createFanoutVoteDriver: n must be >= 1, got ${options.n}`) + } + const selector = options.selector ?? defaultSelector + return { + name: options.name ?? 'fanout-vote', + async plan(task, history) { + if (history.length === 0) return Array.from({ length: options.n }, () => task) + return [] + }, + decide(history) { + const scored = scoreIterations(history) + return selector(scored) ? 
'pick-winner' : 'fail' + }, + } +} + +function defaultSelector( + scored: FanoutVoteScored[], +): FanoutVoteScored | undefined { + const valid = scored.filter((entry) => entry.verdict?.valid === true) + if (valid.length === 0) return undefined + return [...valid].sort( + (a, b) => + (b.verdict?.score ?? 0) - (a.verdict?.score ?? 0) || a.iterationIndex - b.iterationIndex, + )[0] +} + +function scoreIterations( + iterations: ReadonlyArray>, +): FanoutVoteScored[] { + const out: FanoutVoteScored[] = [] + for (const iter of iterations) { + if (iter.output === undefined || iter.error) continue + out.push({ + task: iter.task, + output: iter.output, + verdict: iter.verdict, + iterationIndex: iter.index, + agentRunName: iter.agentRunName, + }) + } + return out +} + +/** + * Test helper: surface the per-iteration scored view a custom `selector` + * would receive. Exposed so consumers writing a custom selector can test it + * standalone without driving the full kernel. + * + * @experimental + */ +export function scoreFanoutVoteIterations( + iterations: ReadonlyArray>, +): FanoutVoteScored[] { + return scoreIterations(iterations) +} diff --git a/src/loops/drivers/refine.ts b/src/loops/drivers/refine.ts new file mode 100644 index 0000000..b069b7e --- /dev/null +++ b/src/loops/drivers/refine.ts @@ -0,0 +1,79 @@ +/** + * @experimental + * + * Refine driver — single task per iteration, validator-gated. + * + * `plan` returns `[task]` (possibly transformed via `refineTask`) until the + * prior verdict is valid OR the local cap is hit, then `[]`. + * `decide` returns `'stop'` once the latest verdict is valid OR the cap is + * reached. The kernel's `maxIterations` is an orthogonal safety cap; + * whichever is lower wins. 
+ */ + +import { ValidationError } from '../../errors' +import type { DefaultVerdict, Driver, Iteration } from '../types' + +export type RefineDecision = 'continue' | 'stop' + +/** @experimental */ +export interface CreateRefineDriverOptions { + /** Hard cap on iterations. Default 5. */ + maxIterations?: number + /** + * Optional task transform applied each round based on the prior verdict. + * When omitted, the same task is replayed and the agent is expected to + * inspect the sandbox session state for prior attempts. + */ + refineTask?: (task: Task, prior: DefaultVerdict) => Task + /** Stable identifier surfaced in trace events. Default `'refine'`. */ + name?: string +} + +/** @experimental */ +export function createRefineDriver( + options: CreateRefineDriverOptions = {}, +): Driver { + const maxIterations = options.maxIterations ?? 5 + if (!Number.isFinite(maxIterations) || maxIterations <= 0) { + throw new ValidationError('createRefineDriver: maxIterations must be > 0') + } + const refineTask = options.refineTask + return { + name: options.name ?? 'refine', + async plan(task, history) { + if (history.length >= maxIterations) return [] + if (history.length === 0) return [task] + const prior = history.at(-1) + if (!prior) return [task] + if (prior.verdict?.valid === true) return [] + // Worker error: replay the same task so the agent can self-correct. + // The driver has no signal beyond `verdict`; only the validator + // controls "good enough". + if (!refineTask || !prior.verdict) return [prior.task] + return [refineTask(prior.task, prior.verdict)] + }, + decide(history) { + const last = history.at(-1) + if (!last) return 'continue' + if (last.verdict?.valid === true) return 'stop' + if (history.length >= maxIterations) return 'stop' + return 'continue' + }, + } +} + +/** + * Test helper: select the last-valid iteration (or the last attempt if + * none passed). 
Unlike the kernel's default winner selector (highest score, + * earliest index on ties), the most recent valid attempt wins here. + * + * @experimental + */ +export function refineWinnerIndex( + iterations: ReadonlyArray>, +): number | undefined { + for (let i = iterations.length - 1; i >= 0; i -= 1) { + if (iterations[i]?.verdict?.valid) return i + } + return iterations.length > 0 ? iterations.length - 1 : undefined +} diff --git a/src/loops/index.ts b/src/loops/index.ts new file mode 100644 index 0000000..8ace184 --- /dev/null +++ b/src/loops/index.ts @@ -0,0 +1,49 @@ +/** + * @experimental + * + * Driven-loop substrate. `runLoop` orchestrates around the sandbox SDK; it + * does not invent its own notion of "what an agent is". Each iteration is + * a `sandboxClient.create({ backend: { profile } })` + `box.streamPrompt` + * call. The driver owns topology; the validator owns scoring; the output + * adapter owns event-stream decode; the kernel owns iteration accounting, + * concurrency, abort, cost aggregation, and trace emission. + */ + +// One-stop import: sandbox-SDK types consumers need to spell out an +// `AgentRunSpec` without importing `@tangle-network/sandbox` separately. 
+export type { + AgentProfile, + CreateSandboxOptions, + SandboxEvent, + SandboxInstance, +} from '@tangle-network/sandbox' +export type { + CreateFanoutVoteDriverOptions, + FanoutVoteDecision, + FanoutVoteScored, +} from './drivers/fanout-vote' +export { createFanoutVoteDriver, scoreFanoutVoteIterations } from './drivers/fanout-vote' +export type { CreateRefineDriverOptions, RefineDecision } from './drivers/refine' +export { createRefineDriver, refineWinnerIndex } from './drivers/refine' +export type { RunLoopOptions } from './run-loop' +export { runLoop } from './run-loop' +export type { + AgentRunSpec, + DefaultVerdict, + Driver, + ExecCtx, + Iteration, + LoopDecisionPayload, + LoopEndedPayload, + LoopIterationEndedPayload, + LoopIterationStartedPayload, + LoopResult, + LoopSandboxClient, + LoopStartedPayload, + LoopTraceEmitter, + LoopTraceEvent, + LoopWinner, + OutputAdapter, + ValidationCtx, + Validator, +} from './types' diff --git a/src/loops/run-loop.ts b/src/loops/run-loop.ts new file mode 100644 index 0000000..d359942 --- /dev/null +++ b/src/loops/run-loop.ts @@ -0,0 +1,534 @@ +/** + * @experimental + * + * `runLoop` — the topology-agnostic kernel built atop the sandbox SDK. + * + * Each iteration: + * 1. `driver.plan(task, history)` → N tasks (1 = refine, N = fanout, 0 = stop) + * 2. For each task (parallel, bounded by `maxConcurrency`): + * a. round-robin an `AgentRunSpec` from `agentRuns` + * b. `sandboxClient.create({ backend: { profile }, ...overrides })` + * c. iterate `box.streamPrompt(taskToPrompt(task))` and collect events + * 3. `output.parse(events)` → typed `Output` + * 4. `validator?.validate(output)` → `DefaultVerdict` + * 5. Append `Iteration` to history; emit `loop.iteration.ended` + * 6. `driver.decide(history)` → if terminal, return result + winner + * + * The kernel owns: iteration accounting, per-iteration timing, error + * capture, abort propagation, concurrency cap, cost aggregation, and trace + * emission. 
The kernel does NOT own: what the agent runs (sandbox SDK + + * profile), how outputs are decoded (output adapter), how outputs are + * scored (validator), or topology (driver). + */ + +import type { + AgentProfile, + CreateSandboxOptions, + SandboxEvent, + SandboxInstance, +} from '@tangle-network/sandbox' +import { ValidationError } from '../errors' +import type { RuntimeStreamEvent } from '../types' +import type { + AgentRunSpec, + Driver, + ExecCtx, + Iteration, + LoopResult, + LoopSandboxClient, + LoopTraceEmitter, + LoopTraceEvent, + LoopWinner, + OutputAdapter, + Validator, +} from './types' + +const DEFAULT_MAX_ITERATIONS = 10 +const DEFAULT_MAX_CONCURRENCY = 4 + +/** @experimental */ +export interface RunLoopOptions { + driver: Driver + /** + * Single agent spec — every iteration uses this profile. Mutually + * exclusive with `agentRuns`. + */ + agentRun?: AgentRunSpec + /** + * Multiple specs for heterogeneous fanout. The kernel round-robins + * through them when the driver plans N tasks. Mutually exclusive with + * `agentRun`. + */ + agentRuns?: AgentRunSpec[] + output: OutputAdapter + validator?: Validator + task: Task + ctx: ExecCtx + /** Default 10. Hard cap on total iterations across all `plan()` rounds. */ + maxIterations?: number + /** Default 4. In-flight worker cap within a single `plan()` batch. */ + maxConcurrency?: number + /** + * Pre-allocated id for trace correlation. Default = `loop-${random}`. + * Surfaces as `runId` on every emitted `LoopTraceEvent`. + */ + runId?: string + /** + * Clock override; default `Date.now`. Deterministic tests pass a + * monotonic counter to stabilize iteration timing fields. + */ + now?: () => number + /** + * Override the default winner selector (highest-valid-score, ties broken + * by earliest iteration). 
+ */ + selectWinner?: (iterations: Iteration[]) => LoopWinner | undefined +} + +/** @experimental */ +export async function runLoop( + options: RunLoopOptions, +): Promise> { + const specs = resolveAgentRuns(options) + const maxIterations = options.maxIterations ?? DEFAULT_MAX_ITERATIONS + if (!Number.isFinite(maxIterations) || maxIterations <= 0) { + throw new ValidationError('runLoop: maxIterations must be > 0') + } + const maxConcurrency = options.maxConcurrency ?? DEFAULT_MAX_CONCURRENCY + if (!Number.isFinite(maxConcurrency) || maxConcurrency <= 0) { + throw new ValidationError('runLoop: maxConcurrency must be > 0') + } + if (!options.ctx?.sandboxClient || typeof options.ctx.sandboxClient.create !== 'function') { + throw new ValidationError('runLoop: ctx.sandboxClient.create is required') + } + const now = options.now ?? Date.now + const runId = options.runId ?? `loop-${randomSuffix()}` + const loopStart = now() + const driverName = options.driver.name ?? 'driver' + const iterations: Iteration[] = [] + + await emitTrace(options.ctx.traceEmitter, { + kind: 'loop.started', + runId, + timestamp: now(), + payload: { + driver: driverName, + agentRunNames: specs.map((spec) => spec.name ?? spec.profile.name ?? 'agent'), + maxIterations, + maxConcurrency, + }, + }) + + const controller = new AbortController() + const onOuterAbort = () => controller.abort() + if (options.ctx.signal) { + if (options.ctx.signal.aborted) controller.abort() + else options.ctx.signal.addEventListener('abort', onOuterAbort, { once: true }) + } + + try { + while (iterations.length < maxIterations) { + if (controller.signal.aborted) throwAbort() + const planned = await options.driver.plan(options.task, iterations) + if (planned.length === 0) break + + const remaining = maxIterations - iterations.length + const slice = planned.slice(0, remaining) + const baseIndex = iterations.length + // Reserve slots up front so concurrent workers may mutate by index. 
+ for (let i = 0; i < slice.length; i += 1) { + const spec = specs[(baseIndex + i) % specs.length]! + iterations.push({ + index: baseIndex + i, + task: slice[i] as Task, + agentRunName: spec.name ?? spec.profile.name ?? 'agent', + events: [], + startedAt: now(), + endedAt: 0, + costUsd: 0, + }) + } + + await runBatch({ + slice, + baseIndex, + iterations, + specs, + output: options.output, + validator: options.validator, + maxConcurrency, + signal: controller.signal, + ctx: options.ctx, + runId, + now, + }) + + if (controller.signal.aborted) throwAbort() + + const decision = await options.driver.decide(iterations) + await emitTrace(options.ctx.traceEmitter, { + kind: 'loop.decision', + runId, + timestamp: now(), + payload: { decision: serializeDecision(decision), historyLength: iterations.length }, + }) + if (isTerminalDecision(decision)) { + return finalize({ + options, + decision, + iterations, + startMs: loopStart, + now, + runId, + }) + } + } + + if (iterations.length >= maxIterations) { + // Cap reached without a terminal decision — ask the driver one more time + // for its final state, then close out. + const decision = await options.driver.decide(iterations) + await emitTrace(options.ctx.traceEmitter, { + kind: 'loop.decision', + runId, + timestamp: now(), + payload: { decision: serializeDecision(decision), historyLength: iterations.length }, + }) + return finalize({ options, decision, iterations, startMs: loopStart, now, runId }) + } + // `plan()` returned `[]` before `decide()` reached a terminal state. 
+ const decision = await options.driver.decide(iterations) + await emitTrace(options.ctx.traceEmitter, { + kind: 'loop.decision', + runId, + timestamp: now(), + payload: { decision: serializeDecision(decision), historyLength: iterations.length }, + }) + return finalize({ options, decision, iterations, startMs: loopStart, now, runId }) + } finally { + if (options.ctx.signal) options.ctx.signal.removeEventListener('abort', onOuterAbort) + } +} + +interface RunBatchArgs { + slice: Task[] + baseIndex: number + iterations: Iteration[] + specs: AgentRunSpec[] + output: OutputAdapter + validator: Validator | undefined + maxConcurrency: number + signal: AbortSignal + ctx: ExecCtx + runId: string + now: () => number +} + +async function runBatch(args: RunBatchArgs) { + const queue = args.slice.map((task, offset) => ({ task, index: args.baseIndex + offset })) + const inflight = new Set>() + while (queue.length > 0 || inflight.size > 0) { + while (inflight.size < args.maxConcurrency && queue.length > 0) { + const item = queue.shift()! + const p = executeIteration({ ...args, item }).finally(() => inflight.delete(p)) + inflight.add(p) + } + if (inflight.size === 0) break + await Promise.race(inflight) + } +} + +interface ExecuteIterationArgs extends RunBatchArgs { + item: { task: Task; index: number } +} + +async function executeIteration(args: ExecuteIterationArgs) { + const slot = args.iterations[args.item.index] + if (!slot) + throw new ValidationError(`runLoop: missing iteration slot at index ${args.item.index}`) + const spec = args.specs[args.item.index % args.specs.length] + if (!spec) throw new ValidationError('runLoop: no AgentRunSpec available for iteration') + slot.startedAt = args.now() + slot.agentRunName = spec.name ?? spec.profile.name ?? 
'agent' + + await emitTrace(args.ctx.traceEmitter, { + kind: 'loop.iteration.started', + runId: args.runId, + timestamp: args.now(), + payload: { + iterationIndex: args.item.index, + agentRunName: slot.agentRunName, + taskHash: hashJson(args.item.task), + }, + }) + + try { + const box = await createSandboxForSpec(args.ctx.sandboxClient, spec, args.signal) + const message = spec.taskToPrompt(args.item.task) + const events: SandboxEvent[] = [] + for await (const event of box.streamPrompt(message, { signal: args.signal })) { + events.push(event) + const llmCall = extractLlmCallEvent(event, slot.agentRunName) + if (llmCall) { + slot.costUsd += llmCall.costUsd ?? 0 + args.ctx.runHandle?.observe(llmCall) + } + } + slot.events = events + slot.output = args.output.parse(events) + if (args.validator) { + slot.verdict = await args.validator.validate(slot.output, { + iteration: args.item.index, + signal: args.signal, + }) + } + } catch (err) { + slot.error = err instanceof Error ? err : new Error(String(err)) + } finally { + slot.endedAt = args.now() + await emitTrace(args.ctx.traceEmitter, { + kind: 'loop.iteration.ended', + runId: args.runId, + timestamp: args.now(), + payload: { + iterationIndex: args.item.index, + agentRunName: slot.agentRunName, + outputHash: slot.output !== undefined ? hashJson(slot.output) : undefined, + verdict: slot.verdict, + error: slot.error?.message, + costUsd: slot.costUsd, + durationMs: slot.endedAt - slot.startedAt, + }, + }) + } +} + +async function createSandboxForSpec( + client: LoopSandboxClient, + spec: AgentRunSpec, + signal: AbortSignal, +): Promise { + const overrides = spec.sandboxOverrides ?? {} + const overrideBackend = overrides.backend + const opts: CreateSandboxOptions = { + ...overrides, + backend: { + type: overrideBackend?.type ?? inferBackendType(spec.profile), + profile: spec.profile satisfies AgentProfile, + ...(overrideBackend?.model ? { model: overrideBackend.model } : {}), + ...(overrideBackend?.server ? 
{ server: overrideBackend.server } : {}), + }, + } + // Cooperative cancellation: if the abort signal fires while .create is + // pending, the promise itself is not abortable but the inflight prompt is. + if (signal.aborted) throwAbort() + return client.create(opts) +} + +function inferBackendType( + profile: AgentProfile, +): CreateSandboxOptions['backend'] extends infer B + ? B extends { type: infer T } + ? T + : never + : never { + // The sandbox SDK accepts profile-driven backend selection by name. When the + // profile carries a string `metadata.backendType` hint we pass it through; + // otherwise we return the literal 'opencode' (the SDK's default on the + // platform side), so the backend type is always set explicitly here. + type BackendType = NonNullable['type'] + const explicit = profile.metadata?.backendType + if (typeof explicit === 'string') return explicit as BackendType + return 'opencode' as BackendType +} + +interface FinalizeArgs { + options: RunLoopOptions + decision: Decision + iterations: Iteration[] + startMs: number + now: () => number + runId: string +} + +function finalize( + args: FinalizeArgs, +): LoopResult { + const winner = (args.options.selectWinner ?? 
defaultSelectWinner)(args.iterations) + const costUsd = args.iterations.reduce((sum, iter) => sum + (iter.costUsd || 0), 0) + const result: LoopResult = { + decision: args.decision, + iterations: args.iterations, + winner, + durationMs: args.now() - args.startMs, + costUsd, + } + void emitTrace(args.options.ctx.traceEmitter, { + kind: 'loop.ended', + runId: args.runId, + timestamp: args.now(), + payload: { + winnerIterationIndex: winner?.iterationIndex, + totalCostUsd: costUsd, + durationMs: result.durationMs, + iterations: args.iterations.length, + }, + }) + return result +} + +function defaultSelectWinner( + iterations: Iteration[], +): LoopWinner | undefined { + const candidates = iterations.filter((iter) => iter.output !== undefined && !iter.error) + if (candidates.length === 0) return undefined + const valid = candidates.filter((iter) => iter.verdict?.valid === true) + const pool = valid.length > 0 ? valid : candidates + const sorted = [...pool].sort( + (a, b) => (b.verdict?.score ?? 0) - (a.verdict?.score ?? 
0) || a.index - b.index, + ) + const top = sorted[0] + if (!top || top.output === undefined) return undefined + return { + task: top.task, + output: top.output, + verdict: top.verdict, + iterationIndex: top.index, + agentRunName: top.agentRunName, + } +} + +function resolveAgentRuns( + options: RunLoopOptions, +): AgentRunSpec[] { + if (options.agentRun && options.agentRuns) { + throw new ValidationError('runLoop: pass exactly one of `agentRun` or `agentRuns`') + } + if (options.agentRun) return [options.agentRun] + if (options.agentRuns && options.agentRuns.length > 0) return options.agentRuns + throw new ValidationError('runLoop: `agentRun` or non-empty `agentRuns` is required') +} + +function isTerminalDecision(decision: unknown): boolean { + return ( + decision === 'stop' || decision === 'pick-winner' || decision === 'fail' || decision === 'done' + ) +} + +function serializeDecision(decision: unknown): string { + if (typeof decision === 'string') return decision + if (decision === null || decision === undefined) return 'null' + try { + return JSON.stringify(decision) + } catch { + return String(decision) + } +} + +async function emitTrace( + emitter: LoopTraceEmitter | undefined, + event: LoopTraceEvent, +): Promise { + if (!emitter) return + await emitter.emit(event) +} + +function randomSuffix(len = 8): string { + return Math.random() + .toString(36) + .slice(2, 2 + len) +} + +function throwAbort(): never { + const err = new Error('aborted') + err.name = 'AbortError' + throw err +} + +/** + * Extract a `RuntimeStreamEvent`-shaped `llm_call` from a sandbox event when + * the event carries usage/cost data. Returns `undefined` for non-cost events + * so the kernel can iterate the full stream without branching. + * + * Sandbox SDK emits a polymorphic `SandboxEvent = { type, data, id? }`. The + * canonical cost-carrying types observed in the wild: + * - `llm_call` — `data: { model, tokensIn, tokensOut, costUsd, ... 
}` + * - `message.completed` / `result` — `data: { usage: { inputTokens, + * outputTokens, totalCostUsd? } }` + * - `cost.usage` — same shape under a dedicated type + * + * Numeric coercion is strict: `Number.isFinite` gates every accumulator + * write so a sentinel `NaN` from a misbehaving backend cannot poison the + * ledger. + */ +function extractLlmCallEvent( + event: SandboxEvent, + agentRunName: string, +): (RuntimeStreamEvent & { type: 'llm_call' }) | undefined { + if (!event || typeof event !== 'object') return undefined + const type = String(event.type ?? '') + const data = + event.data && typeof event.data === 'object' + ? (event.data as Record) + : ({} as Record) + + if (type === 'llm_call' || type === 'cost.usage' || type === 'usage') { + return buildLlmCall(data, agentRunName) + } + if (type === 'message.completed' || type === 'result' || type === 'final') { + const usage = data.usage as Record | undefined + if (!usage || typeof usage !== 'object') return undefined + return buildLlmCall({ ...usage, model: data.model ?? usage.model }, agentRunName) + } + return undefined +} + +function buildLlmCall( + data: Record, + agentRunName: string, +): (RuntimeStreamEvent & { type: 'llm_call' }) | undefined { + const tokensIn = pickFiniteNumber(data, ['tokensIn', 'inputTokens', 'prompt_tokens']) + const tokensOut = pickFiniteNumber(data, ['tokensOut', 'outputTokens', 'completion_tokens']) + const costUsd = pickFiniteNumber(data, ['costUsd', 'totalCostUsd', 'cost_usd', 'cost']) + if (tokensIn === undefined && tokensOut === undefined && costUsd === undefined) { + return undefined + } + const model = typeof data.model === 'string' && data.model.length > 0 ? 
data.model : agentRunName + const event: RuntimeStreamEvent & { type: 'llm_call' } = { + type: 'llm_call', + model, + } + if (tokensIn !== undefined) event.tokensIn = tokensIn + if (tokensOut !== undefined) event.tokensOut = tokensOut + if (costUsd !== undefined) event.costUsd = costUsd + return event +} + +function pickFiniteNumber(data: Record, keys: string[]): number | undefined { + for (const key of keys) { + const value = data[key] + if (typeof value === 'number' && Number.isFinite(value)) return value + } + return undefined +} + +/** + * Stable hash for the trace payload. Not cryptographic — only used so + * downstream eval pipelines can group iterations whose task / output is the + * same. Bare structural hash; non-JSON values stringify via their `toString`. + */ +function hashJson(value: unknown): string { + let str: string + try { + str = JSON.stringify(value) ?? String(value) + } catch { + str = String(value) + } + // FNV-1a 32-bit — branch-free, dependency-free, good enough for grouping. + let h = 0x811c9dc5 + for (let i = 0; i < str.length; i += 1) { + h ^= str.charCodeAt(i) + h = Math.imul(h, 0x01000193) + } + return (h >>> 0).toString(16).padStart(8, '0') +} diff --git a/src/loops/trace.ts b/src/loops/trace.ts new file mode 100644 index 0000000..8a537fb --- /dev/null +++ b/src/loops/trace.ts @@ -0,0 +1,22 @@ +/** + * @experimental + * + * Loop-topology trace events. Independent from `runHandle.observe`, which + * tracks cost. These describe the loop's iteration tree so downstream eval + * pipelines can group traces by topology (refine vs fanout, which spec ran + * each iteration, who won). + * + * Re-exported from `./types` for back-compat with the kernel's local imports; + * the canonical home is `./types` so call sites that already import from + * `loops` don't double-import. 
+ */ + +export type { + LoopDecisionPayload, + LoopEndedPayload, + LoopIterationEndedPayload, + LoopIterationStartedPayload, + LoopStartedPayload, + LoopTraceEmitter, + LoopTraceEvent, +} from './types' diff --git a/src/loops/types.ts b/src/loops/types.ts new file mode 100644 index 0000000..100bd34 --- /dev/null +++ b/src/loops/types.ts @@ -0,0 +1,235 @@ +/** + * @experimental + * + * Driven-loop substrate — type surface. + * + * The loop kernel orchestrates around the sandbox SDK; it does not invent + * its own notion of "what an agent is". Each iteration is a sandbox-SDK + * `streamPrompt` call against an `AgentProfile`. The kernel owns iteration + * accounting, concurrency, abort propagation, cost aggregation, and trace + * emission; the driver owns topology (plan + decide); the validator owns + * output scoring; the output adapter owns event-stream → typed-output decode. + */ + +import type { + AgentProfile, + CreateSandboxOptions, + SandboxEvent, + SandboxInstance, +} from '@tangle-network/sandbox' +import type { RuntimeRunHandle } from '../runtime-run' + +/** @experimental */ +export interface DefaultVerdict { + /** Whether the output meets the validator's pass criteria. */ + valid: boolean + /** Aggregate score in [0, 1]. Drivers use this for winner selection. */ + score: number + /** Per-dimension scores. Free-form; weighted into `score` by the validator. */ + scores?: Record + /** Human-readable rationale; surfaces in trace + final-result `winner.verdict`. */ + notes?: string +} + +/** @experimental */ +export interface ValidationCtx { + /** Iteration index this output came from (0-based). */ + iteration: number + /** Cooperative cancellation channel. */ + signal: AbortSignal +} + +/** @experimental */ +export interface Validator { + validate(output: Output, ctx: ValidationCtx): Promise +} + +/** + * Sandbox-SDK-shaped agent specification. 
+ * + * The kernel uses `profile` to instantiate a sandbox per iteration, formats + * `task` into a prompt via `taskToPrompt`, and merges `sandboxOverrides` into + * the `CreateSandboxOptions` it passes to `client.create`. Heterogeneous + * fanout supplies multiple `AgentRunSpec`s and the kernel round-robins + * through them when the driver plans N tasks. + * + * @experimental + */ +export interface AgentRunSpec { + /** Sandbox SDK profile — what kind of agent runs the task. */ + profile: AgentProfile + /** Task → prompt formatter. Pure and deterministic. */ + taskToPrompt: (task: Task) => string + /** + * Per-spec stable name. Surfaced in trace events and the default winner + * selector tiebreak. Falls back to `profile.name ?? 'agent'`. + */ + name?: string + /** + * Optional sandbox-SDK `CreateSandboxOptions` overrides merged on top of + * the kernel's defaults. `backend.profile` is set to `profile` by the + * kernel and cannot be overridden here — use `profile` itself for that. + */ + sandboxOverrides?: Partial> & { + backend?: Omit, 'profile'> + } +} + +/** + * Stream of `SandboxEvent`s → typed `Output`. + * + * Adapters are pure functions over the already-collected event array; they + * do not receive the live AsyncIterable so they can be replayed against + * persisted streams during tests / replays. + * + * @experimental + */ +export interface OutputAdapter { + parse(events: SandboxEvent[]): Output +} + +/** @experimental */ +export interface Iteration { + /** 0-based iteration index assigned by the kernel. */ + index: number + task: Task + /** Stable name of the `AgentRunSpec` that produced this iteration. */ + agentRunName: string + output?: Output + verdict?: DefaultVerdict + error?: Error + /** Raw sandbox event stream collected for this iteration. */ + events: SandboxEvent[] + startedAt: number + endedAt: number + costUsd: number +} + +/** @experimental */ +export interface Driver { + /** + * Stable identifier surfaced in trace events. 
Default `'driver'`. + */ + readonly name?: string + /** + * Tasks to issue this iteration. `[task]` → refine; N copies → fanout; + * `[]` → no more work this round (kernel proceeds to `decide`). + */ + plan(task: Task, history: ReadonlyArray>): Promise + /** + * Inspect history and return the next state. The kernel terminates the + * loop when `decide` returns a value listed in `isTerminalDecision` + * (`'stop' | 'pick-winner' | 'fail' | 'done'`), when `maxIterations` + * is hit, or when the abort signal fires. + */ + decide(history: ReadonlyArray>): Decision | Promise +} + +/** @experimental */ +export interface LoopWinner { + task: Task + output: Output + verdict?: DefaultVerdict + iterationIndex: number + agentRunName: string +} + +/** @experimental */ +export interface LoopResult { + decision: Decision + iterations: Iteration[] + winner?: LoopWinner + durationMs: number + /** Sum of every iteration's `costUsd`. */ + costUsd: number +} + +/** + * Minimal sandbox client surface the kernel calls. Satisfied structurally by + * `new Sandbox({ apiKey, baseUrl })` — declared as a structural type so + * tests can pass a stub without instantiating the SDK. 
+ * + * @experimental + */ +export interface LoopSandboxClient { + create(options?: CreateSandboxOptions): Promise +} + +/** @experimental */ +export interface LoopTraceEmitter { + emit(event: LoopTraceEvent): void | Promise +} + +/** @experimental */ +export type LoopTraceEvent = + | { kind: 'loop.started'; runId: string; timestamp: number; payload: LoopStartedPayload } + | { + kind: 'loop.iteration.started' + runId: string + timestamp: number + payload: LoopIterationStartedPayload + } + | { + kind: 'loop.iteration.ended' + runId: string + timestamp: number + payload: LoopIterationEndedPayload + } + | { kind: 'loop.decision'; runId: string; timestamp: number; payload: LoopDecisionPayload } + | { kind: 'loop.ended'; runId: string; timestamp: number; payload: LoopEndedPayload } + +/** @experimental */ +export interface LoopStartedPayload { + driver: string + agentRunNames: string[] + maxIterations: number + maxConcurrency: number +} + +/** @experimental */ +export interface LoopIterationStartedPayload { + iterationIndex: number + agentRunName: string + taskHash: string +} + +/** @experimental */ +export interface LoopIterationEndedPayload { + iterationIndex: number + agentRunName: string + outputHash?: string + verdict?: DefaultVerdict + error?: string + costUsd: number + durationMs: number +} + +/** @experimental */ +export interface LoopDecisionPayload { + decision: string + historyLength: number +} + +/** @experimental */ +export interface LoopEndedPayload { + winnerIterationIndex?: number + totalCostUsd: number + durationMs: number + iterations: number +} + +/** @experimental */ +export interface ExecCtx { + /** Sandbox SDK client — the kernel calls `.create()` per iteration. */ + sandboxClient: LoopSandboxClient + /** Optional trace emitter. When set, the kernel emits `loop.*` events. */ + traceEmitter?: LoopTraceEmitter + /** + * Optional production-run handle. 
When set, every synthesized `llm_call` + * the kernel infers from a sandbox event stream is forwarded via + * `runHandle.observe` so per-run cost aggregates pick up loop spend. + */ + runHandle?: RuntimeRunHandle + /** Cooperative cancellation signal. */ + signal?: AbortSignal +} diff --git a/src/profiles/coder.ts b/src/profiles/coder.ts new file mode 100644 index 0000000..80149d3 --- /dev/null +++ b/src/profiles/coder.ts @@ -0,0 +1,398 @@ +/** + * @experimental + * + * `coderProfile` — opinionated preset for code-modification tasks. + * + * The agent is told to: + * - work on a fresh branch inside the sandbox workspace + * - keep the patch minimal (under `maxDiffLines`) + * - avoid `forbiddenPaths` + * - run `testCmd` and `typecheckCmd` + * - emit a final JSON result the output adapter parses + * + * The profile is stateless and agent-agnostic — `harness` selects the + * sandbox-SDK backend (`claude-code`, `codex`, `opencode/*`). For + * heterogeneous fanout, use `multiHarnessCoderFanout`. + */ + +import type { AgentProfile, SandboxEvent } from '@tangle-network/sandbox' +import { createFanoutVoteDriver } from '../loops/drivers/fanout-vote' +import type { AgentRunSpec, DefaultVerdict, Driver, OutputAdapter, Validator } from '../loops/types' + +const DEFAULT_MAX_DIFF_LINES = 400 + +/** @experimental */ +export interface CoderTask { + /** What the agent must accomplish. Free-form prose. */ + goal: string + /** Absolute path inside the sandbox where the repo lives. */ + repoRoot: string + /** Default `main`. The branch the agent diffs against. */ + baseBranch?: string + /** Default `pnpm test --run`. */ + testCmd?: string + /** Default `pnpm typecheck`. */ + typecheckCmd?: string + /** Files the agent may inspect for context. Surfaced verbatim in the prompt. */ + contextFiles?: string[] + /** + * Paths the agent must not touch. Validator hard-fails on any match. + * Use glob-free literal path prefixes for unambiguous enforcement. 
+ */ + forbiddenPaths?: string[] + /** Default 400. Hard cap; validator hard-fails when exceeded. */ + maxDiffLines?: number +} + +/** @experimental */ +export interface CoderOutput { + /** Branch the agent wrote the patch on. */ + branch: string + /** Unified diff (`git diff ..HEAD`). */ + patch: string + testResult: { passed: boolean; output: string } + typecheckResult: { passed: boolean; output: string } + diffStats: { filesChanged: number; insertions: number; deletions: number } + /** Optional reviewer commentary surfaced by the agent. */ + reviewerNotes?: string +} + +/** @experimental */ +export interface CoderProfileOptions { + /** Sandbox-SDK backend.type. Default `'claude-code'`. */ + harness?: string + /** Default model id passed in `AgentProfile.model.default`. */ + model?: string + /** Custom system prompt replacement. Default = built-in coder preset. */ + systemPrompt?: string + /** Stable name for `AgentRunSpec.name`. Default = `coder-${harness}`. */ + name?: string +} + +/** + * Build a coder preset. + * + * `validator` enforces test + typecheck + a 400-line default diff cap. For + * per-task `forbiddenPaths` / `maxDiffLines` enforcement, pass `task` here + * — the returned validator closes over its constraints. Without a task + * the validator falls back to the default cap and skips path enforcement. + * + * @experimental + */ +export function coderProfile(options: CoderProfileOptions & { task?: CoderTask } = {}): { + profile: AgentProfile + taskToPrompt: (task: CoderTask) => string + output: OutputAdapter + validator: Validator + agentRunSpec: AgentRunSpec +} { + const harness = options.harness ?? 'claude-code' + const name = options.name ?? `coder-${harness}` + const systemPrompt = options.systemPrompt ?? DEFAULT_CODER_SYSTEM_PROMPT + const profile: AgentProfile = { + name, + description: 'Code-modification agent. Minimal-diff worktree-based coder.', + prompt: { systemPrompt }, + model: options.model ? 
{ default: options.model } : undefined, + tools: { git: true, fs: true, shell: true, test_runner: true }, + metadata: { backendType: harness, role: 'coder' }, + } + const output: OutputAdapter = { parse: parseCoderEvents } + const validator: Validator = options.task + ? createCoderValidator(options.task) + : createCoderValidator({ + goal: '', + repoRoot: '', + forbiddenPaths: [], + maxDiffLines: DEFAULT_MAX_DIFF_LINES, + }) + const agentRunSpec: AgentRunSpec = { + name, + profile, + taskToPrompt: formatCoderPrompt, + } + return { profile, taskToPrompt: formatCoderPrompt, output, validator, agentRunSpec } +} + +/** @experimental */ +export interface MultiHarnessCoderFanoutOptions { + /** + * Sandbox-SDK backend.type identifiers, one per parallel agent. Default: + * `['claude-code', 'codex', 'opencode/zai-coding-plan/glm-5.1']`. + */ + harnesses?: string[] + /** Optional per-harness model override. Indexed parallel to `harnesses`. */ + models?: (string | undefined)[] +} + +/** @experimental */ +export function multiHarnessCoderFanout(options: MultiHarnessCoderFanoutOptions = {}): { + agentRuns: AgentRunSpec[] + output: OutputAdapter + validator: Validator + driver: Driver +} { + const harnesses = + options.harnesses && options.harnesses.length > 0 + ? options.harnesses + : ['claude-code', 'codex', 'opencode/zai-coding-plan/glm-5.1'] + const models = options.models ?? [] + const agentRuns = harnesses.map((harness, i) => { + const { agentRunSpec } = coderProfile({ harness, model: models[i] }) + return agentRunSpec + }) + const { output, validator } = coderProfile() + const driver = createFanoutVoteDriver({ n: harnesses.length }) + return { agentRuns, output, validator, driver } +} + +const DEFAULT_CODER_SYSTEM_PROMPT = [ + 'You are a coder agent operating inside an isolated sandbox workspace.', + 'Your job is to deliver a minimal, correct patch for the user-supplied goal.', + '', + 'Hard rules:', + ' 1. Work on a fresh branch off the supplied base. 
/**
 * Render a `CoderTask` as the user prompt handed to the coder agent.
 *
 * Unset optional fields fall back to the preset defaults (`main`,
 * `pnpm test --run`, `pnpm typecheck`, `DEFAULT_MAX_DIFF_LINES`); absent or
 * empty path lists render as `(none)`. Pure: same task in, same text out.
 *
 * @param task - The task to describe.
 * @returns A newline-joined prompt.
 */
function formatCoderPrompt(task: CoderTask): string {
  const forbiddenPaths = task.forbiddenPaths ?? []
  const contextFiles = task.contextFiles ?? []

  const forbiddenLine = forbiddenPaths.length > 0 ? forbiddenPaths.join(', ') : '(none)'

  let contextBlock = ' (none)'
  if (contextFiles.length > 0) {
    contextBlock = contextFiles.map((file) => ` - ${file}`).join('\n')
  }

  const lines: string[] = []
  lines.push(`Goal: ${task.goal}`)
  lines.push(`Repo: ${task.repoRoot}`)
  lines.push(`Base branch: ${task.baseBranch ?? 'main'}`)
  lines.push(`Run tests with: ${task.testCmd ?? 'pnpm test --run'}`)
  lines.push(`Run typecheck with: ${task.typecheckCmd ?? 'pnpm typecheck'}`)
  lines.push(`Forbidden paths: ${forbiddenLine}`)
  lines.push(`Max diff lines: ${task.maxDiffLines ?? DEFAULT_MAX_DIFF_LINES}`)
  lines.push('Context files:')
  lines.push(contextBlock)
  lines.push('')
  lines.push('Produce a minimal patch on a fresh branch. Run tests and typecheck before')
  lines.push('returning. Emit the final JSON result block exactly as instructed.')
  return lines.join('\n')
}
/**
 * Decode a collected sandbox event stream into a `CoderOutput`.
 *
 * Resolution order, first hit wins:
 *   1. The newest `result` / `final` / `coder.result` event whose
 *      `data.result`, `data.output`, or `data` itself coerces to a
 *      `CoderOutput`.
 *   2. The newest single event whose `data.text` / `data.delta` contains a
 *      fenced JSON block that coerces to a `CoderOutput`.
 *   3. The same fenced-JSON search over all text fragments joined in stream
 *      order. This recovers a result block that was split across several
 *      events (assumes fragments arrive in order — which is what a delta
 *      stream implies, but the SDK contract is not visible from here).
 *
 * When nothing matches, returns an empty output whose test and typecheck
 * results are marked failed, so the validator rejects it rather than
 * mistaking "no result" for success.
 *
 * @param events - Every event collected for one iteration, oldest first.
 * @returns The decoded output; never throws on malformed events.
 */
function parseCoderEvents(events: SandboxEvent[]): CoderOutput {
  // Pass 1: structured result events, newest first.
  for (let i = events.length - 1; i >= 0; i -= 1) {
    const event = events[i]
    if (!event) continue
    const type = String(event.type ?? '')
    if (type !== 'result' && type !== 'final' && type !== 'coder.result') continue
    const data = isRecord(event.data) ? event.data : {}
    const direct = coerceCoderOutput(data.result ?? data.output ?? data)
    if (direct) return direct
  }

  // Pass 2: a fenced JSON block fully contained in one text event, newest
  // first. Fragments are collected along the way for pass 3.
  const fragmentsNewestFirst: string[] = []
  for (let i = events.length - 1; i >= 0; i -= 1) {
    const event = events[i]
    if (!event) continue
    const data = isRecord(event.data) ? event.data : {}
    const text = pickString(data.text) ?? pickString(data.delta)
    if (!text) continue
    fragmentsNewestFirst.push(text)
    const fenced = extractFencedJson(text)
    if (!fenced) continue
    const coerced = coerceCoderOutput(fenced)
    if (coerced) return coerced
  }

  // Pass 3: the block may have been streamed in pieces. With a single
  // fragment the joined text equals what pass 2 already tried, so skip it.
  if (fragmentsNewestFirst.length > 1) {
    const joined = [...fragmentsNewestFirst].reverse().join('')
    const fenced = extractFencedJson(joined)
    if (fenced) {
      const coerced = coerceCoderOutput(fenced)
      if (coerced) return coerced
    }
  }

  return {
    branch: '',
    patch: '',
    testResult: { passed: false, output: '' },
    typecheckResult: { passed: false, output: '' },
    diffStats: { filesChanged: 0, insertions: 0, deletions: 0 },
  }
}
/**
 * Build a validator bound to one `CoderTask`'s constraints.
 *
 * Checks, all of which must hold for `valid`:
 *   1. Forbidden paths: no path touched by the patch equals, or lives under,
 *      an entry of `task.forbiddenPaths`.
 *   2. Diff size: changed-line count must not exceed `task.maxDiffLines`
 *      (default `DEFAULT_MAX_DIFF_LINES`); under the cap the `diffSize`
 *      score shrinks linearly with size.
 *   3. Tests: `output.testResult.passed` is `true`.
 *   4. Typecheck: `output.typecheckResult.passed` is `true`.
 *
 * Aggregate score: `0.5 * tests + 0.3 * typecheck + 0.2 * diffSize`. The
 * forbidden-path check affects `valid` and `scores.forbiddenPath` only.
 *
 * @experimental
 * @param task - Supplies `forbiddenPaths` and `maxDiffLines`.
 * @returns A validator producing a `DefaultVerdict`; failures are listed in `notes`.
 */
export function createCoderValidator(task: CoderTask): Validator<CoderOutput> {
  const maxDiff = task.maxDiffLines ?? DEFAULT_MAX_DIFF_LINES
  const forbidden = task.forbiddenPaths ?? []
  return {
    async validate(output) {
      const scores: Record<string, number> = {}
      const notes: string[] = []
      let pass = true

      const touched = touchedPathsFromPatch(output.patch)
      const touchedForbidden = forbidden.filter((path) => {
        // Match the entry itself or anything beneath it; the trailing slash
        // stops `src/secret` from matching `src/secretary.ts`.
        const prefix = path.endsWith('/') ? path : `${path}/`
        const exact = prefix.slice(0, -1)
        return touched.some((p) => p === exact || p.startsWith(prefix))
      })
      if (touchedForbidden.length > 0) {
        pass = false
        scores.forbiddenPath = 0
        notes.push(`touched forbidden paths: ${touchedForbidden.join(', ')}`)
      } else {
        scores.forbiddenPath = 1
      }

      const diffLines = countDiffLines(output.patch)
      if (diffLines > maxDiff) {
        pass = false
        scores.diffSize = 0
        notes.push(`diff ${diffLines} lines exceeds cap ${maxDiff}`)
      } else {
        // A zero cap would divide by zero; score it 0 instead.
        scores.diffSize = maxDiff === 0 ? 0 : Math.max(0, 1 - diffLines / maxDiff)
      }

      scores.tests = output.testResult.passed ? 1 : 0
      scores.typecheck = output.typecheckResult.passed ? 1 : 0
      if (!output.testResult.passed) {
        pass = false
        notes.push('tests failed')
      }
      if (!output.typecheckResult.passed) {
        pass = false
        notes.push('typecheck failed')
      }

      const score = 0.5 * scores.tests + 0.3 * scores.typecheck + 0.2 * scores.diffSize
      const verdict: DefaultVerdict = {
        valid: pass,
        score: Number.isFinite(score) ? score : 0,
        scores,
      }
      if (notes.length > 0) verdict.notes = notes.join('; ')
      return verdict
    },
  }
}

/** Git extended-header lines that name a path without any `a/` / `b/` prefix. */
const GIT_BARE_PATH_HEADERS = ['rename from ', 'rename to ', 'copy from ', 'copy to ']

/**
 * `diff --git a/<path> b/<path>` where both sides name the same path. The
 * back-reference keeps the match unambiguous even when the path has spaces.
 */
const GIT_SAME_PATH_HEADER = /^diff --git a\/(.+) b\/\1$/

/** Unified-diff hunk header; captures the old and new line counts (default 1). */
const HUNK_HEADER = /^@@ -\d+(?:,(\d+))? \+\d+(?:,(\d+))? @@/

/**
 * Collect every file path a unified diff touches.
 *
 * Sources, deliberately over-inclusive because the result feeds the
 * forbidden-path check (a false positive is safer than a miss):
 *   - `--- ` / `+++ ` file headers, minus a leading `a/` / `b/` and any
 *     tab-separated timestamp; `/dev/null` is ignored.
 *   - git `rename from|to` / `copy from|to` lines, so pure renames (which
 *     carry no `---` / `+++` lines) are seen.
 *   - same-path `diff --git` lines, so mode-only and binary changes are seen.
 *
 * @param patch - Unified diff text.
 * @returns De-duplicated paths in first-seen order.
 */
function touchedPathsFromPatch(patch: string): string[] {
  const out = new Set<string>()
  for (const line of patch.split(/\r?\n/)) {
    if (line.startsWith('+++ ') || line.startsWith('--- ')) {
      // Drop a trailing "\t<timestamp>" (plain `diff -u`) before trimming.
      const rest = (line.slice(4).split('\t')[0] ?? '').trim()
      if (rest === '' || rest === '/dev/null') continue
      out.add(rest.startsWith('a/') || rest.startsWith('b/') ? rest.slice(2) : rest)
      continue
    }
    const bareHeader = GIT_BARE_PATH_HEADERS.find((prefix) => line.startsWith(prefix))
    if (bareHeader !== undefined) {
      const path = line.slice(bareHeader.length).trim()
      if (path !== '') out.add(path)
      continue
    }
    const samePath = GIT_SAME_PATH_HEADER.exec(line)
    if (samePath?.[1]) out.add(samePath[1])
  }
  return [...out]
}

/**
 * Count added plus removed lines in a unified diff.
 *
 * Inside a hunk announced by a well-formed `@@ -a,b +c,d @@` header, the
 * declared counts decide what is content, so a changed line whose own text
 * starts with `--` or `++` (a removed Markdown `---` rule, say) is counted
 * rather than mistaken for a file header. Outside such hunks — including
 * patches with no hunk headers at all — any `+` / `-` line that is not a
 * `+++` / `---` header counts.
 *
 * @param patch - Unified diff text.
 * @returns Number of changed lines.
 */
function countDiffLines(patch: string): number {
  const lines = patch.split(/\r?\n/)
  let count = 0
  let oldLeft = 0
  let newLeft = 0
  for (let i = 0; i < lines.length; i += 1) {
    const line = lines[i] ?? ''
    if (oldLeft > 0 || newLeft > 0) {
      // Declared counts can overshoot; a `--- ` line directly followed by a
      // `+++ ` line is the next file's header pair, not hunk content.
      const nextFileHeader = line.startsWith('--- ') && (lines[i + 1] ?? '').startsWith('+++ ')
      const marker = nextFileHeader ? 'header' : line.charAt(0)
      if (marker === '+') {
        count += 1
        newLeft = Math.max(0, newLeft - 1)
        continue
      }
      if (marker === '-') {
        count += 1
        oldLeft = Math.max(0, oldLeft - 1)
        continue
      }
      // Context line; some tools strip the lone space from blank context lines.
      if (marker === ' ' || marker === '') {
        oldLeft = Math.max(0, oldLeft - 1)
        newLeft = Math.max(0, newLeft - 1)
        continue
      }
      // "\ No newline at end of file" belongs to neither side.
      if (marker === '\\') continue
      // Anything else means the hunk ended early; handle the line as a header.
      oldLeft = 0
      newLeft = 0
    }
    const hunk = HUNK_HEADER.exec(line)
    if (hunk) {
      oldLeft = hunk[1] === undefined ? 1 : Number(hunk[1])
      newLeft = hunk[2] === undefined ? 1 : Number(hunk[2])
      continue
    }
    if (
      (line.startsWith('+') || line.startsWith('-')) &&
      !line.startsWith('+++') &&
      !line.startsWith('---')
    ) {
      count += 1
    }
  }
  return count
}

/** True for non-null, non-array objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return value !== null && typeof value === 'object' && !Array.isArray(value)
}

/** Return `value` when it is a non-empty string, otherwise `undefined`. */
function pickString(value: unknown): string | undefined {
  return typeof value === 'string' && value.length > 0 ? value : undefined
}

/**
 * Parse the first fenced code block in `text` whose body is valid JSON.
 *
 * Fences tagged `json` and untagged fences are both considered. Blocks that
 * are empty or do not parse are skipped, so a code snippet printed before
 * the result block no longer hides it.
 *
 * @param text - Free-form agent text.
 * @returns The parsed JSON value, or `undefined` when no fence parses.
 */
function extractFencedJson(text: string): unknown | undefined {
  for (const match of text.matchAll(/```(?:json)?\s*([\s\S]*?)```/gi)) {
    const body = (match[1] ?? '').trim()
    if (!body) continue
    try {
      return JSON.parse(body)
    } catch {
      // Not JSON (e.g. a source snippet); try the next fenced block.
    }
  }
  return undefined
}

/**
 * Coerce an untrusted value into a `CoderOutput`.
 *
 * A non-empty string `branch` is the only hard requirement; every other
 * field degrades to a safe default (empty patch, failed command results,
 * zeroed stats).
 *
 * @returns The coerced output, or `undefined` when `value` is not an object
 *   or has no usable `branch`.
 */
function coerceCoderOutput(value: unknown): CoderOutput | undefined {
  if (!isRecord(value)) return undefined
  const branch = pickString(value.branch)
  const patch = pickString(value.patch) ?? ''
  if (branch === undefined) return undefined
  const testResult = coerceCmdResult(value.testResult)
  const typecheckResult = coerceCmdResult(value.typecheckResult)
  const diffStats = coerceDiffStats(value.diffStats)
  return {
    branch,
    patch,
    testResult,
    typecheckResult,
    diffStats,
    reviewerNotes: pickString(value.reviewerNotes),
  }
}

/**
 * Coerce a command result. `passed` is `true` only for a literal boolean
 * `true`, so truthy strings such as `"false"` can never count as a pass.
 */
function coerceCmdResult(value: unknown): { passed: boolean; output: string } {
  if (!isRecord(value)) return { passed: false, output: '' }
  return {
    passed: value.passed === true,
    output: pickString(value.output) ?? '',
  }
}

/** Coerce diff statistics; missing or invalid counters become 0. */
function coerceDiffStats(value: unknown): {
  filesChanged: number
  insertions: number
  deletions: number
} {
  if (!isRecord(value)) return { filesChanged: 0, insertions: 0, deletions: 0 }
  return {
    filesChanged: toFiniteInt(value.filesChanged),
    insertions: toFiniteInt(value.insertions),
    deletions: toFiniteInt(value.deletions),
  }
}

/** Truncate a finite number to a non-negative integer; anything else is 0. */
function toFiniteInt(value: unknown): number {
  if (typeof value !== 'number') return 0
  if (!Number.isFinite(value)) return 0
  return Math.max(0, Math.trunc(value))
}
+ */ + +export type { + CoderOutput, + CoderProfileOptions, + CoderTask, + MultiHarnessCoderFanoutOptions, +} from './coder' +export { coderProfile, createCoderValidator, multiHarnessCoderFanout } from './coder' diff --git a/tests/loops/composition.test.ts b/tests/loops/composition.test.ts new file mode 100644 index 0000000..3a97ea2 --- /dev/null +++ b/tests/loops/composition.test.ts @@ -0,0 +1,179 @@ +import type { AgentProfile, SandboxEvent, SandboxInstance } from '@tangle-network/sandbox' +import { describe, expect, it } from 'vitest' +import { + type AgentRunSpec, + createFanoutVoteDriver, + createRefineDriver, + type Driver, + type OutputAdapter, + runLoop, + type Validator, +} from '../../src/loops' + +interface Task { + goal: string +} + +interface Inner { + attempt: number +} + +interface Outer { + best: number +} + +const profile: AgentProfile = { name: 'compose-stub' } + +const innerOutput: OutputAdapter = { + parse(events) { + const last = events.at(-1) + const data = last?.data as { attempt?: number } | undefined + return { attempt: typeof data?.attempt === 'number' ? 
data.attempt : 0 } + }, +} + +const innerValidator: Validator = { + async validate(out) { + return { valid: out.attempt >= 2, score: out.attempt / 3 } + }, +} + +const outerValidator: Validator = { + async validate(out) { + return { valid: out.best >= 2, score: out.best } + }, +} + +const innerSpec: AgentRunSpec = { + profile, + name: 'inner', + taskToPrompt: (t) => t.goal, +} + +function counterClient() { + let i = 0 + return { + async create() { + const attempt = ++i + return { + async *streamPrompt() { + yield { type: 'result', data: { attempt } } satisfies SandboxEvent + }, + } as unknown as SandboxInstance + }, + } +} + +describe('runLoop composition — a Driver that nests runLoop inside plan()', () => { + it('wraps an inner refine loop and the outer driver gates on the inner winner', async () => { + const innerClient = counterClient() + + // Outer driver: each iteration's plan kicks off a full inner refine loop + // and yields a *single* outer task whose output adapter just stamps the + // inner best score. The outer task never reaches the sandbox client + // because we hand the kernel a no-op spec that immediately yields a + // synthetic result mirroring the inner winner. + let innerBest = 0 + const outerDriver: Driver = { + name: 'outer', + async plan(task, history) { + if (history.length >= 2) return [] + const innerResult = await runLoop({ + driver: createRefineDriver(), + agentRun: innerSpec, + output: innerOutput, + validator: innerValidator, + task, + ctx: { sandboxClient: innerClient }, + }) + innerBest = innerResult.winner?.verdict?.score ?? 
0 + return [task] + }, + decide(history) { + const last = history.at(-1) + if (last?.verdict?.valid) return 'stop' + if (history.length >= 2) return 'stop' + return 'continue' + }, + } + + let outerCalls = 0 + const outerClient = { + async create() { + outerCalls += 1 + return { + async *streamPrompt() { + yield { type: 'result', data: { best: innerBest } } satisfies SandboxEvent + }, + } as unknown as SandboxInstance + }, + } + + const outerOutput: OutputAdapter = { + parse(events) { + const last = events.at(-1) + const data = last?.data as { best?: number } | undefined + return { best: typeof data?.best === 'number' ? data.best : 0 } + }, + } + + const result = await runLoop({ + driver: outerDriver, + agentRun: { + profile, + name: 'outer-agent', + taskToPrompt: (t) => `outer:${t.goal}`, + }, + output: outerOutput, + validator: outerValidator, + task: { goal: 'compose' }, + ctx: { sandboxClient: outerClient }, + }) + + expect(outerCalls).toBeGreaterThan(0) + // The inner refine produces attempt=2 on the second iteration (score=2/3), + // which fails outerValidator's `best >= 2` check, so the loop exhausts + // the outer cap and stops without winning — but the structure shows the + // nesting works end-to-end. + expect(result.iterations.length).toBeGreaterThan(0) + expect(result.decision).toBe('stop') + }) + + it('static type check: a driver may compose multiple runLoops sequentially', () => { + // Compile-time proof that nested runLoop calls return well-typed results. + // The body is intentionally unreachable; the assertion is the type + // signature itself. 
+ async function _typecheckOnly() { + const r1 = await runLoop({ + driver: createRefineDriver(), + agentRun: innerSpec, + output: innerOutput, + validator: innerValidator, + task: { goal: '' }, + ctx: { + sandboxClient: { + async create() { + throw new Error() + }, + }, + }, + }) + const r2 = await runLoop({ + driver: createFanoutVoteDriver({ n: 2 }), + agentRun: innerSpec, + output: innerOutput, + validator: innerValidator, + task: { goal: '' }, + ctx: { + sandboxClient: { + async create() { + throw new Error() + }, + }, + }, + }) + return { r1, r2 } + } + expect(typeof _typecheckOnly).toBe('function') + }) +}) diff --git a/tests/loops/fanout-vote.test.ts b/tests/loops/fanout-vote.test.ts new file mode 100644 index 0000000..288bb99 --- /dev/null +++ b/tests/loops/fanout-vote.test.ts @@ -0,0 +1,281 @@ +import type { + AgentProfile, + CreateSandboxOptions, + SandboxEvent, + SandboxInstance, +} from '@tangle-network/sandbox' +import { describe, expect, it } from 'vitest' +import { + type AgentRunSpec, + createFanoutVoteDriver, + type OutputAdapter, + runLoop, + scoreFanoutVoteIterations, + type Validator, +} from '../../src/loops' + +interface FanTask { + goal: string +} + +interface FanOutput { + variantId: string + score: number +} + +const output: OutputAdapter = { + parse(events) { + const last = events.at(-1) + const data = last?.data as { variantId?: string; score?: number } | undefined + return { + variantId: data?.variantId ?? '', + score: typeof data?.score === 'number' ? 
data.score : 0, + } + }, +} + +const validator: Validator = { + async validate(out) { + return { valid: out.score > 0.5, score: out.score } + }, +} + +function profile(name: string): AgentProfile { + return { name } +} + +function specs(names: string[]): AgentRunSpec[] { + return names.map((name) => ({ + profile: profile(name), + name, + taskToPrompt: (t) => t.goal, + })) +} + +function deterministicClient(outputs: Array<{ variantId: string; score: number }>): { + client: { create(opts?: CreateSandboxOptions): Promise } + observed: { creates: number; concurrentMax: number } +} { + const state = { creates: 0, concurrentMax: 0, inflight: 0 } + const pending: Array<() => void> = [] + return { + observed: state as { creates: number; concurrentMax: number }, + client: { + async create() { + const i = state.creates + state.creates += 1 + const variant = outputs[i] ?? { variantId: `unknown-${i}`, score: 0 } + const release = new Promise((resolve) => pending.push(resolve)) + const box = { + async *streamPrompt() { + state.inflight += 1 + state.concurrentMax = Math.max(state.concurrentMax, state.inflight) + // Yield to the scheduler so all sandboxes start in parallel. + await new Promise((r) => setTimeout(r, 0)) + // Release the next pending sandbox so concurrency can climb. + const next = pending.shift() + if (next) next() + await release + state.inflight -= 1 + yield { + type: 'result', + data: { variantId: variant.variantId, score: variant.score }, + } satisfies SandboxEvent + }, + } as unknown as SandboxInstance + // First sandbox releases the pump. 
+ if (i === 0) { + setTimeout(() => { + const next = pending.shift() + if (next) next() + }, 0) + } + return box + }, + }, + } +} + +describe('runLoop + createFanoutVoteDriver', () => { + it('spawns N parallel attempts and selects the highest-scoring valid winner', async () => { + const outputs = [ + { variantId: 'a', score: 0.3 }, + { variantId: 'b', score: 0.9 }, + { variantId: 'c', score: 0.7 }, + ] + let createdCount = 0 + const client = { + async create() { + const i = createdCount + createdCount += 1 + const variant = outputs[i]! + return { + async *streamPrompt() { + yield { + type: 'result', + data: { variantId: variant.variantId, score: variant.score }, + } satisfies SandboxEvent + }, + } as unknown as SandboxInstance + }, + } + + const result = await runLoop({ + driver: createFanoutVoteDriver({ n: 3 }), + agentRun: { + profile: profile('uniform'), + name: 'uniform', + taskToPrompt: (t) => t.goal, + }, + output, + validator, + task: { goal: 'fanout' }, + ctx: { sandboxClient: client }, + }) + + expect(result.iterations).toHaveLength(3) + expect(result.decision).toBe('pick-winner') + expect(result.winner?.output.variantId).toBe('b') + expect(result.winner?.verdict?.score).toBeCloseTo(0.9, 6) + }) + + it('resolves to fail when no iteration produces a valid output', async () => { + let createdCount = 0 + const client = { + async create() { + const i = createdCount + createdCount += 1 + return { + async *streamPrompt() { + yield { + type: 'result', + data: { variantId: `v${i}`, score: 0.1 }, + } satisfies SandboxEvent + }, + } as unknown as SandboxInstance + }, + } + + const result = await runLoop({ + driver: createFanoutVoteDriver({ n: 2 }), + agentRun: { + profile: profile('weak'), + name: 'weak', + taskToPrompt: (t) => t.goal, + }, + output, + validator, + task: { goal: 'fail-fanout' }, + ctx: { sandboxClient: client }, + }) + + expect(result.decision).toBe('fail') + expect(result.iterations).toHaveLength(2) + }) + + it('respects maxConcurrency cap on 
parallel fanout', async () => {
+    const { client, observed } = deterministicClient(
+      Array.from({ length: 4 }, (_, i) => ({ variantId: `v${i}`, score: 0.6 })),
+    )
+    await runLoop({
+      driver: createFanoutVoteDriver({ n: 4 }),
+      agentRun: {
+        profile: profile('uniform'),
+        name: 'uniform',
+        taskToPrompt: (t) => t.goal,
+      },
+      output,
+      validator,
+      task: { goal: 'cap' },
+      ctx: { sandboxClient: client },
+      maxConcurrency: 2,
+    })
+
+    expect(observed.creates).toBe(4)
+    expect(observed.concurrentMax).toBeLessThanOrEqual(2)
+  })
+
+  it('rotates through heterogeneous agentRuns for diversity', async () => {
+    const used: string[] = []
+    let createdCount = 0
+    const client = {
+      async create(opts?: CreateSandboxOptions) {
+        const name =
+          (opts?.backend?.profile && typeof opts.backend.profile === 'object'
+            ? opts.backend.profile.name
+            : undefined) ?? 'unknown'
+        used.push(name)
+        const i = createdCount
+        createdCount += 1
+        return {
+          async *streamPrompt() {
+            yield {
+              type: 'result',
+              data: { variantId: `${name}-${i}`, score: 0.9 },
+            } satisfies SandboxEvent
+          },
+        } as unknown as SandboxInstance
+      },
+    }
+
+    const result = await runLoop({
+      driver: createFanoutVoteDriver({ n: 3 }),
+      agentRuns: specs(['alpha', 'beta', 'gamma']),
+      output,
+      validator,
+      task: { goal: 'diversity' },
+      ctx: { sandboxClient: client },
+    })
+
+    expect(used).toEqual(['alpha', 'beta', 'gamma'])
+    expect(result.iterations.map((i) => i.agentRunName)).toEqual(['alpha', 'beta', 'gamma'])
+  })
+
+  it('scoreFanoutVoteIterations surfaces the per-iteration view', () => {
+    const scored = scoreFanoutVoteIterations([
+      {
+        index: 0,
+        task: { goal: '' },
+        agentRunName: 'a',
+        events: [],
+        startedAt: 0,
+        endedAt: 0,
+        costUsd: 0,
+        output: { variantId: 'x', score: 0.5 },
+        verdict: { valid: true, score: 0.5 },
+      },
+      {
+        index: 1,
+        task: { goal: '' },
+        agentRunName: 'b',
+        events: [],
+        startedAt: 0,
+        endedAt: 0,
+        costUsd: 0,
+        error: new Error('boom'),
+      },
+    ])
+    expect(scored).toHaveLength(1)
+    expect(scored[0]?.iterationIndex).toBe(0)
+  })
+
+  it('rejects mismatched options (agentRun + agentRuns)', async () => {
+    await expect(
+      runLoop({
+        driver: createFanoutVoteDriver({ n: 1 }),
+        agentRun: specs(['a'])[0],
+        agentRuns: specs(['a', 'b']),
+        output,
+        validator,
+        task: { goal: 'bad' },
+        ctx: {
+          sandboxClient: {
+            async create() {
+              throw new Error('unreachable')
+            },
+          },
+        },
+      }),
+    ).rejects.toThrow(/exactly one of/i)
+  })
+})
diff --git a/tests/loops/refine.test.ts b/tests/loops/refine.test.ts
new file mode 100644
index 0000000..50121de
--- /dev/null
+++ b/tests/loops/refine.test.ts
@@ -0,0 +1,291 @@
+import type {
+  AgentProfile,
+  CreateSandboxOptions,
+  SandboxEvent,
+  SandboxInstance,
+} from '@tangle-network/sandbox'
+import { describe, expect, it } from 'vitest'
+import {
+  type AgentRunSpec,
+  createRefineDriver,
+  type LoopTraceEvent,
+  type OutputAdapter,
+  refineWinnerIndex,
+  runLoop,
+  type Validator,
+} from '../../src/loops'
+
+interface RefineTask {
+  goal: string
+}
+
+interface RefineOutput { // NOTE(review): never referenced in this file — generic arguments on OutputAdapter/Validator/AgentRunSpec appear stripped; confirm
+  attempt: number
+}
+
+const profile: AgentProfile = { name: 'stub' }
+
+function spec(): AgentRunSpec {
+  return {
+    profile,
+    name: 'refiner',
+    taskToPrompt: (task) => task.goal,
+  }
+}
+
+/**
+ * Fake sandbox client for driving `runLoop` without a real sandbox.
+ *
+ * The nth `create()` call returns a sandbox whose `streamPrompt` replays
+ * `eventsPerCall[n]` (nothing once the list is exhausted) and records the
+ * prompt it was given. `creates` is a live getter and `prompts` is the shared
+ * array, so both reflect calls made after this function returns.
+ */
+function stubClient(eventsPerCall: SandboxEvent[][]): {
+  client: { create(opts?: CreateSandboxOptions): Promise<SandboxInstance> }
+  creates: number
+  prompts: string[]
+} {
+  const state = { creates: 0, prompts: [] as string[] }
+  let callIndex = 0
+  return {
+    get creates() { return state.creates }, // a plain `creates: state.creates` would freeze the value at 0
+    prompts: state.prompts,
+    client: {
+      async create() {
+        state.creates += 1
+        const events = eventsPerCall[callIndex] ?? []
+        callIndex += 1
+        const box = {
+          async *streamPrompt(message: string) {
+            state.prompts.push(message)
+            for (const e of events) yield e
+          },
+        } as unknown as SandboxInstance
+        return box
+      },
+    },
+  }
+}
+
+const output: OutputAdapter = {
+  parse: (events) => {
+    const last = events.at(-1)
+    const data = last?.data as { attempt?: number } | undefined
+    return { attempt: typeof data?.attempt === 'number' ? data.attempt : -1 }
+  },
+}
+
+const passOnSecond: Validator = {
+  async validate(out) {
+    if (out.attempt >= 2) return { valid: true, score: 1, scores: { attempt: 1 } }
+    return { valid: false, score: 0, scores: { attempt: 0 }, notes: 'try again' }
+  },
+}
+
+describe('runLoop + createRefineDriver', () => {
+  it('iterates until the validator returns valid=true', async () => {
+    const stub = stubClient([
+      [{ type: 'result', data: { attempt: 1 } }],
+      [{ type: 'result', data: { attempt: 2 } }],
+    ])
+    const result = await runLoop({
+      driver: createRefineDriver(),
+      agentRun: spec(),
+      output,
+      validator: passOnSecond,
+      task: { goal: 'fix it' },
+      ctx: { sandboxClient: stub.client },
+    })
+
+    expect(result.decision).toBe('stop')
+    expect(result.iterations).toHaveLength(2)
+    expect(result.iterations[0]?.verdict?.valid).toBe(false)
+    expect(result.iterations[1]?.verdict?.valid).toBe(true)
+    expect(result.winner?.iterationIndex).toBe(1)
+    expect(result.winner?.output).toEqual({ attempt: 2 })
+    expect(stub.creates).toBe(2) // events are served per create() call, so two iterations mean two sandboxes
+  })
+
+  it('respects the driver-local maxIterations cap and reports stop', async () => {
+    const events: SandboxEvent[][] = Array.from({ length: 6 }, (_, i) => [
+      { type: 'result', data: { attempt: i } },
+    ])
+    const failing: Validator = {
+      async validate() {
+        return { valid: false, score: 0, scores: { attempt: 0 } }
+      },
+    }
+    const stub = stubClient(events)
+    const result = await runLoop({
+      driver: createRefineDriver({ maxIterations: 3 }),
+      agentRun: spec(),
+      output,
+      validator: failing,
+      task: { goal: 'never passes' },
+
ctx: { sandboxClient: stub.client },
+    })
+
+    expect(result.iterations).toHaveLength(3)
+    expect(result.decision).toBe('stop')
+    expect(result.winner?.iterationIndex).toBeDefined()
+  })
+
+  it('respects the kernel maxIterations cap and re-asks the driver for a final decision', async () => {
+    const events: SandboxEvent[][] = Array.from({ length: 4 }, () => [
+      { type: 'result', data: { attempt: 0 } },
+    ])
+    const failing: Validator = {
+      async validate() {
+        return { valid: false, score: 0, scores: { attempt: 0 } }
+      },
+    }
+    const stub = stubClient(events)
+    const result = await runLoop({
+      driver: createRefineDriver({ maxIterations: 10 }),
+      agentRun: spec(),
+      output,
+      validator: failing,
+      task: { goal: 'never passes' },
+      ctx: { sandboxClient: stub.client },
+      maxIterations: 2, // kernel-level cap, lower than the driver's own cap of 10
+    })
+    expect(result.iterations).toHaveLength(2) // NOTE(review): the "final decision" from the title is never asserted — consider pinning result.decision
+  })
+
+  it('emits trace events in canonical order', async () => {
+    const events: LoopTraceEvent[] = []
+    const stub = stubClient([
+      [{ type: 'result', data: { attempt: 1 } }],
+      [{ type: 'result', data: { attempt: 2 } }],
+    ])
+    await runLoop({
+      driver: createRefineDriver(),
+      agentRun: spec(),
+      output,
+      validator: passOnSecond,
+      task: { goal: 'trace order' },
+      ctx: {
+        sandboxClient: stub.client,
+        traceEmitter: { emit: (e) => void events.push(e) },
+      },
+      runId: 'fixed-run-id',
+    })
+
+    const kinds = events.map((e) => e.kind)
+    expect(kinds[0]).toBe('loop.started')
+    expect(kinds.at(-1)).toBe('loop.ended')
+    // Each iteration emits a started + ended; two iterations = two pairs.
+    const startedCount = kinds.filter((k) => k === 'loop.iteration.started').length
+    const endedCount = kinds.filter((k) => k === 'loop.iteration.ended').length
+    expect(startedCount).toBe(2)
+    expect(endedCount).toBe(2)
+    // Every event references the same runId.
+    expect(events.every((e) => e.runId === 'fixed-run-id')).toBe(true)
+    // Decision event follows each iteration.
+    const decisionCount = kinds.filter((k) => k === 'loop.decision').length
+    expect(decisionCount).toBeGreaterThanOrEqual(2)
+  })
+
+  it('captures per-iteration errors without aborting the whole loop', async () => {
+    const stub = {
+      creates: 0,
+      client: {
+        async create() {
+          stub.creates += 1
+          if (stub.creates === 1) { // only the first sandbox fails; later ones succeed
+            return {
+              streamPrompt(): AsyncIterable<SandboxEvent> {
+                return {
+                  [Symbol.asyncIterator]: () => ({
+                    next: () => Promise.reject(new Error('sandbox blew up')),
+                  }),
+                }
+              },
+            } as unknown as SandboxInstance
+          }
+          return {
+            async *streamPrompt() {
+              yield { type: 'result', data: { attempt: 2 } } satisfies SandboxEvent
+            },
+          } as unknown as SandboxInstance
+        },
+      },
+    }
+
+    const result = await runLoop({
+      driver: createRefineDriver(),
+      agentRun: spec(),
+      output,
+      validator: passOnSecond,
+      task: { goal: 'survive errors' },
+      ctx: { sandboxClient: stub.client },
+    })
+
+    expect(result.iterations[0]?.error?.message).toContain('sandbox blew up')
+    expect(result.iterations[0]?.output).toBeUndefined()
+    expect(result.iterations[1]?.verdict?.valid).toBe(true)
+    expect(result.decision).toBe('stop')
+  })
+
+  it('aggregates costUsd from llm_call events across iterations', async () => {
+    const stub = stubClient([
+      [
+        { type: 'llm_call', data: { tokensIn: 100, tokensOut: 50, costUsd: 0.01, model: 'm' } },
+        { type: 'result', data: { attempt: 1 } },
+      ],
+      [
+        { type: 'llm_call', data: { tokensIn: 80, tokensOut: 30, costUsd: 0.02, model: 'm' } },
+        { type: 'result', data: { attempt: 2 } },
+      ],
+    ])
+    const result = await runLoop({
+      driver: createRefineDriver(),
+      agentRun: spec(),
+      output,
+      validator: passOnSecond,
+      task: { goal: 'cost' },
+      ctx: { sandboxClient: stub.client },
+    })
+    expect(result.iterations[0]?.costUsd).toBeCloseTo(0.01, 9)
+    expect(result.iterations[1]?.costUsd).toBeCloseTo(0.02, 9)
+    expect(result.costUsd).toBeCloseTo(0.03, 9)
+  })
+
+  it('refineWinnerIndex returns the last valid iteration', () => {
+    expect(
+      refineWinnerIndex([
+
{
+          index: 0,
+          task: {} as RefineTask,
+          agentRunName: 'refiner',
+          events: [],
+          startedAt: 0,
+          endedAt: 0,
+          costUsd: 0,
+          verdict: { valid: false, score: 0 },
+        },
+        {
+          index: 1,
+          task: {} as RefineTask,
+          agentRunName: 'refiner',
+          events: [],
+          startedAt: 0,
+          endedAt: 0,
+          costUsd: 0,
+          verdict: { valid: true, score: 1 },
+        },
+        {
+          index: 2,
+          task: {} as RefineTask,
+          agentRunName: 'refiner',
+          events: [],
+          startedAt: 0,
+          endedAt: 0,
+          costUsd: 0,
+          verdict: { valid: false, score: 0 },
+        },
+      ]),
+    ).toBe(1)
+  })
+})
diff --git a/tests/profiles/coder.test.ts b/tests/profiles/coder.test.ts
new file mode 100644
index 0000000..4b356c7
--- /dev/null
+++ b/tests/profiles/coder.test.ts
@@ -0,0 +1,190 @@
+import type { SandboxEvent } from '@tangle-network/sandbox'
+import { describe, expect, it } from 'vitest'
+import {
+  type CoderOutput,
+  type CoderTask,
+  coderProfile,
+  createCoderValidator,
+  multiHarnessCoderFanout,
+} from '../../src/profiles'
+
+const ctx = { iteration: 0, signal: new AbortController().signal }
+
+/**
+ * Builds a synthetic unified diff: each path in `filesTouched` gets a
+ * `diff --git` header plus `plusLines` added and `minusLines` removed lines.
+ */
+function diff(filesTouched: string[], plusLines: number, minusLines: number): string {
+  const out: string[] = []
+  for (const path of filesTouched) {
+    out.push(`diff --git a/${path} b/${path}`)
+    out.push(`--- a/${path}`)
+    out.push(`+++ b/${path}`)
+    for (let i = 0; i < plusLines; i += 1) out.push(`+line ${i}`)
+    for (let i = 0; i < minusLines; i += 1) out.push(`-line ${i}`)
+  }
+  return out.join('\n')
+}
+
+const baseTask: CoderTask = {
+  goal: 'minor fix',
+  repoRoot: '/repo',
+  forbiddenPaths: ['secrets/', 'dist/'],
+  maxDiffLines: 100,
+}
+
+describe('createCoderValidator — task-bound validator', () => {
+  it('passes when tests + typecheck + diff size + forbidden-path all clean', async () => {
+    const validator = createCoderValidator(baseTask)
+    const output: CoderOutput = {
+      branch: 'feat/x',
+      patch: diff(['src/foo.ts'], 10, 5),
+      testResult: { passed: true, output: 'ok' },
+      typecheckResult: { passed: true, output: 'ok' },
+      diffStats: { filesChanged: 1, insertions: 10, deletions: 5 },
+    }
+    const verdict = await validator.validate(output, ctx)
+    expect(verdict.valid).toBe(true)
+    // score = 0.5 + 0.3 + 0.2*(1 - 15/100) = 0.5 + 0.3 + 0.17 = 0.97
+    expect(verdict.score).toBeCloseTo(0.97, 6)
+    expect(verdict.scores?.forbiddenPath).toBe(1)
+    expect(verdict.scores?.diffSize).toBeCloseTo(0.85, 6)
+  })
+
+  it('fails hard when a forbidden path is touched', async () => {
+    const validator = createCoderValidator(baseTask)
+    const output: CoderOutput = {
+      branch: 'feat/x',
+      patch: diff(['secrets/keys.ts'], 1, 0),
+      testResult: { passed: true, output: 'ok' },
+      typecheckResult: { passed: true, output: 'ok' },
+      diffStats: { filesChanged: 1, insertions: 1, deletions: 0 },
+    }
+    const verdict = await validator.validate(output, ctx)
+    expect(verdict.valid).toBe(false)
+    expect(verdict.scores?.forbiddenPath).toBe(0)
+    expect(verdict.notes).toMatch(/forbidden/)
+  })
+
+  it('fails hard when diff exceeds maxDiffLines', async () => {
+    const validator = createCoderValidator({ ...baseTask, maxDiffLines: 5 })
+    const output: CoderOutput = {
+      branch: 'feat/x',
+      patch: diff(['src/foo.ts'], 10, 0),
+      testResult: { passed: true, output: 'ok' },
+      typecheckResult: { passed: true, output: 'ok' },
+      diffStats: { filesChanged: 1, insertions: 10, deletions: 0 },
+    }
+    const verdict = await validator.validate(output, ctx)
+    expect(verdict.valid).toBe(false)
+    expect(verdict.scores?.diffSize).toBe(0)
+    expect(verdict.notes).toMatch(/exceeds cap 5/)
+  })
+
+  it('fails when tests fail; score still reflects partial credit elsewhere', async () => {
+    const validator = createCoderValidator(baseTask)
+    const output: CoderOutput = {
+      branch: 'feat/x',
+      patch: diff(['src/foo.ts'], 4, 1),
+      testResult: { passed: false, output: 'red' },
+      typecheckResult: { passed: true, output: 'ok' },
+      diffStats: { filesChanged: 1, insertions: 4, deletions: 1 },
+    }
+    const verdict = await validator.validate(output, ctx)
+
expect(verdict.valid).toBe(false)
+    expect(verdict.scores?.tests).toBe(0)
+    expect(verdict.scores?.typecheck).toBe(1)
+    // score = 0 + 0.3 + 0.2*(1 - 5/100) = 0.3 + 0.19 = 0.49
+    expect(verdict.score).toBeCloseTo(0.49, 6)
+  })
+
+  it('fails when typecheck fails', async () => {
+    const validator = createCoderValidator(baseTask)
+    const output: CoderOutput = {
+      branch: 'feat/x',
+      patch: diff(['src/foo.ts'], 2, 0),
+      testResult: { passed: true, output: 'ok' },
+      typecheckResult: { passed: false, output: 'TS2304' },
+      diffStats: { filesChanged: 1, insertions: 2, deletions: 0 },
+    }
+    const verdict = await validator.validate(output, ctx)
+    expect(verdict.valid).toBe(false)
+    expect(verdict.scores?.typecheck).toBe(0)
+  })
+
+  it('treats subdirectory matches under a forbidden prefix as forbidden', async () => {
+    const validator = createCoderValidator({ ...baseTask, forbiddenPaths: ['vendor'] }) // prefix given without a trailing slash
+    const output: CoderOutput = {
+      branch: 'feat/x',
+      patch: diff(['vendor/lib/file.ts'], 1, 0),
+      testResult: { passed: true, output: '' },
+      typecheckResult: { passed: true, output: '' },
+      diffStats: { filesChanged: 1, insertions: 1, deletions: 0 },
+    }
+    const verdict = await validator.validate(output, ctx)
+    expect(verdict.valid).toBe(false)
+    expect(verdict.notes).toMatch(/vendor/)
+  })
+})
+
+describe('coderProfile output adapter', () => {
+  const preset = coderProfile({ task: baseTask })
+
+  it('parses a final result event with embedded coder output', () => {
+    const events: SandboxEvent[] = [
+      { type: 'text_delta', data: { text: 'working...' } },
+      {
+        type: 'result',
+        data: {
+          result: {
+            branch: 'feat/y',
+            patch: diff(['src/foo.ts'], 2, 0),
+            testResult: { passed: true, output: 'ok' },
+            typecheckResult: { passed: true, output: 'ok' },
+            diffStats: { filesChanged: 1, insertions: 2, deletions: 0 },
+            reviewerNotes: 'lgtm',
+          },
+        },
+      },
+    ]
+    const out = preset.output.parse(events)
+    expect(out.branch).toBe('feat/y')
+    expect(out.testResult.passed).toBe(true)
+    expect(out.diffStats.insertions).toBe(2)
+    expect(out.reviewerNotes).toBe('lgtm')
+  })
+
+  it('falls back to parsing a fenced JSON block out of a text delta', () => {
+    const fenced =
+      'Done. Here is the patch summary:\n```json\n' +
+      JSON.stringify({
+        branch: 'feat/z',
+        patch: '',
+        testResult: { passed: false, output: 'fail' },
+        typecheckResult: { passed: true, output: '' },
+        diffStats: { filesChanged: 0, insertions: 0, deletions: 0 },
+      }) +
+      '\n```'
+    const events: SandboxEvent[] = [{ type: 'text_delta', data: { text: fenced } }]
+    const out = preset.output.parse(events)
+    expect(out.branch).toBe('feat/z')
+    expect(out.testResult.passed).toBe(false)
+  })
+
+  it('returns an empty CoderOutput when no structured result is present', () => {
+    const events: SandboxEvent[] = [{ type: 'text_delta', data: { text: 'hello' } }]
+    const out = preset.output.parse(events)
+    expect(out.branch).toBe('')
+    expect(out.testResult.passed).toBe(false)
+    expect(out.diffStats.filesChanged).toBe(0)
+  })
+})
+
+describe('multiHarnessCoderFanout — heterogeneous fanout bundle', () => {
+  it('produces one AgentRunSpec per harness and a fanout driver of matching n', () => {
+    const bundle = multiHarnessCoderFanout({ harnesses: ['claude-code', 'codex'] }) // NOTE(review): the title mentions the driver's n, but nothing below asserts on the driver
+    expect(bundle.agentRuns).toHaveLength(2)
+    expect(bundle.agentRuns.map((s) => s.name)).toEqual(['coder-claude-code', 'coder-codex'])
+    expect(bundle.agentRuns.every((s) => s.profile.tools?.git === true)).toBe(true)
+  })
+})
diff --git a/tsup.config.ts b/tsup.config.ts
index 1453476..7f29af4 100644
--- a/tsup.config.ts
+++ b/tsup.config.ts
@@ -6,6 +6,8 @@ export default defineConfig({
     platform: 'src/platform/index.ts',
     'analyst-loop': 'src/analyst-loop/index.ts',
     agent: 'src/agent/index.ts',
+    loops: 'src/loops/index.ts',
+    profiles: 'src/profiles/index.ts',
   },
   format: ['esm'],
   dts: true,