diff --git a/flake.nix b/flake.nix index 9c5ce3b..2aef96e 100644 --- a/flake.nix +++ b/flake.nix @@ -49,9 +49,33 @@ { frigate = pkgs.frigate; default = pkgs.frigate; + + # frigate-bench: measure bitcoind RPC cost with frigate's actual + # call patterns and HTTP keep-alive (no fresh-curl-per-call TCP + # handshake artifacts). Run via `nix run .#frigate-bench` — see + # `apps.${system}.frigate-bench` below. + # + # Style/whitespace warnings (E501 line length, E226/E241 spacing, + # E265 block-comment style) are ignored — they fight with the + # script's print-table alignment and aren't worth contorting for. + frigate-bench = pkgs.writers.writePython3Bin "frigate-bench" { + flakeIgnore = [ + "E501" + "E226" + "E241" + "E265" + ]; + } (builtins.readFile ./tools/frigate-bench/bench.py); } ); + apps = forAllSystems (system: { + frigate-bench = { + type = "app"; + program = "${self.packages.${system}.frigate-bench}/bin/frigate-bench"; + }; + }); + nixosModules = { frigate = ./modules/frigate.nix; hetzner-bare-metal = ./modules/presets/hetzner-bare-metal.nix; diff --git a/tools/frigate-bench/README.md b/tools/frigate-bench/README.md new file mode 100644 index 0000000..1e65ad9 --- /dev/null +++ b/tools/frigate-bench/README.md @@ -0,0 +1,88 @@ +# frigate-bench + +A small benchmark that measures bitcoind RPC cost the way Frigate experiences +it: HTTP keep-alive on one persistent TCP connection, hitting the call patterns +Frigate's hot loops actually use. Useful for comparing a loopback deployment +against a remote-backend deployment (e.g. `frigate-edge` over WireGuard) and +seeing how much network latency the deployment actually pays. + +## Why this exists + +A naive benchmark like `for i in 1..100; do curl ... ; done` measures TCP +handshakes more than it measures RPC cost — every `curl` invocation opens a +fresh connection and pays the 1.5-RTT 3-way-handshake before any data moves. +Frigate's Java HTTP client doesn't do that; it keeps one connection open and +streams requests. 
This script mirrors that behavior so the numbers reflect +what Frigate actually sees. + +## What it measures + +| Test | Method | Why | +|---|---|---| +| `getbestblockhash` × 1000 | small in, small out | pure roundtrip latency on a warm connection | +| `getblockcount` × 1000 | small in, small out | same — sanity-check pure latency | +| `getblockhash(tip)` × 1000 | small in, small out | reorg-detection style call | +| `getblockheader(tip, verbose)` × 1000 | small in, ~600 B out | header-fetch style call | +| `getblock(tip, 0)` × 10 | small in, ~1-2 MB raw block hex | initial block sync per-block cost | +| `getblock(tip, 1)` × 10 | small in, ~50-100 KB | block + txids form | +| `getrawmempool` × 1 | small in, ~6 MB on mainnet | bulk mempool listing | +| `getrawtransaction` × 1000 | small in, ~200-2000 B per tx | the inner loop of mempool init / steady-state new-tx scan | + +Then extrapolates the per-tx cost across the *full* current mempool to predict +how long a from-scratch mempool init would take. + +## Usage + +The script reads `user:password` from stdin (so the password doesn't show up in +process listings or shell history) and takes the RPC URL as its only argument. 
+ +```sh +# Run directly with python3: +sudo cat /run/agenix/bitcoind-rpc-creds | python3 bench.py http://127.0.0.1:8332/ + +# Or via the roost flake (uses the python3 in the closure, no host deps): +sudo cat /run/agenix/bitcoind-rpc-creds | nix run github:2140-dev/roost#frigate-bench -- http://127.0.0.1:8332/ +``` + +For an A/B comparison between a loopback consumer and a remote-mesh consumer, +run the same script from both hosts pointing at the same bitcoind instance: + +```sh +# Loopback (on the box running bitcoind) +sudo cat /run/agenix/bitcoind-rpc-creds \ + | nix run github:2140-dev/roost#frigate-bench -- http://127.0.0.1:8332/ + +# Over mesh (on the edge consumer box) +sudo cat /run/agenix/bitcoind-rpc-creds \ + | nix run github:2140-dev/roost#frigate-bench -- http://10.42.0.1:8332/ +``` + +The two outputs are directly comparable per-line. + +## Interpreting the numbers + +- The **pure-latency tests** show TCP+HTTP overhead per call on the link. With + keep-alive, this is approximately one RTT per call — so an inter-DC link at + ~25 ms RTT gives ~25-30 ms per call, while loopback gives ~1 ms. +- The **big-payload tests** are dominated by bandwidth, not latency. The ratio + between local and remote here tells you how much link throughput hurts. +- The **`getrawtransaction` sample** is the single most relevant signal for + predicting Frigate startup cost — frigate calls this per mempool entry on + first start. The summary extrapolates to the current mempool size. + +A multi-box arrangement where the edge has a fast link to the backend will +look fine on pure-latency tests but pay ~100-300x in mempool-init wall time +compared to loopback. That's a one-time cost per Frigate restart, not a +steady-state cost. + +## Not measured + +- **Fulcrum/Electrum protocol** — Frigate proxies non-SP queries to fulcrum, + but client-driven traffic is low volume in normal operation. If that becomes + a concern, this script could be extended to probe Electrum over TCP/TLS. 
+- **ZMQ `sequence` delivery latency** — Frigate's steady-state path receives + bitcoind tx/block events over ZMQ, then fetches each new tx via RPC. ZMQ + delivery is push-based and not amenable to synthetic benchmarking; measure + it from frigate's own logs (`INFO Subscribed to ZMQ sequence publisher` + appears once subscribed; live tx ingestion latency is visible by comparing + the bitcoind mempool-accept time with frigate's processing time). diff --git a/tools/frigate-bench/bench.py b/tools/frigate-bench/bench.py new file mode 100644 index 0000000..518b2b8 --- /dev/null +++ b/tools/frigate-bench/bench.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python3 +"""Measure bitcoind RPC cost as the Frigate workload experiences it. + +Frigate's hot loops on bitcoind, in order of how much they show up: + + - mempool-init: `getrawmempool` once, then `getrawtransaction(txid, false)` + per mempool entry. On mainnet that's tens of thousands of small sequential + RPCs. This phase dominates the latency a frigate consumer notices on + first startup or after a service restart. + - block sync: `getblockhash(h)` + `getblock(hash, 0)` per height. Big payload + per call (~1-2 MB raw block hex on mainnet), bandwidth-bound rather than + latency-bound. + - reorg/probe: `getblockchaininfo`, `getblockhash`, `getblockheader` — + small calls, infrequent in steady state. + - steady-state per new tx (with ZMQ active): one `getrawtransaction` per + transaction frigate is notified about. + +This benchmark uses HTTP keep-alive on a single persistent TCP connection, +matching what frigate's Java HTTP client actually does. Naive curl-per-call +benchmarks measure TCP handshake more than RPC cost, which overstates the +latency penalty of a remote backend dramatically. 
"""Measure bitcoind RPC cost as the Frigate workload experiences it.

Frigate's hot loops on bitcoind, in order of how much they show up:

  - mempool-init: `getrawmempool` once, then `getrawtransaction(txid, false)`
    per mempool entry. On mainnet that's tens of thousands of small sequential
    RPCs. This phase dominates the latency a frigate consumer notices on
    first startup or after a service restart.
  - block sync: `getblockhash(h)` + `getblock(hash, 0)` per height. Big payload
    per call (~1-2 MB raw block hex on mainnet), bandwidth-bound rather than
    latency-bound.
  - reorg/probe: `getblockchaininfo`, `getblockhash`, `getblockheader` —
    small calls, infrequent in steady state.
  - steady-state per new tx (with ZMQ active): one `getrawtransaction` per
    transaction frigate is notified about.

This benchmark uses HTTP keep-alive on a single persistent TCP connection,
matching what frigate's Java HTTP client actually does. Naive curl-per-call
benchmarks measure TCP handshake more than RPC cost, which overstates the
latency penalty of a remote backend dramatically.

USAGE
    frigate-bench RPC_URL      # reads `user:password` from stdin

    RPC_URL may be http:// or https://; the port defaults to 8332 and the
    URL path (if any) is used as the request path.

EXAMPLES
    sudo cat /run/agenix/bitcoind-rpc-creds \\
      | frigate-bench http://127.0.0.1:8332/     # loopback baseline

    sudo cat /run/agenix/bitcoind-rpc-creds \\
      | frigate-bench http://10.42.0.1:8332/     # over a mesh (edge consumer)

OUTPUT
    Per-test: total wall time, calls/sec, ms per call. A final summary
    extrapolates current mempool-init cost from the per-tx sample.
"""
import base64
import http.client
import json
import random
import sys
import time
from urllib.parse import urlparse

# Port assumed when the URL does not name one (the port used in the examples).
_DEFAULT_RPC_PORT = 8332


class _RpcError(RuntimeError):
    """The server answered, but the reply carried a JSON-RPC ``error`` object."""


def _open_connection(url):
    """Turn ``url`` into an HTTP(S) connection object plus the request path.

    No socket is opened here; ``http.client`` connects on the first request.

    Returns:
        ``(connection, path)`` where ``path`` is the URL path, or ``"/"`` if
        the URL has none.

    Exits with an ``error: ...`` message when the URL cannot be parsed, has no
    host, or uses a scheme other than http/https.
    """
    try:
        parsed = urlparse(url)
        host = parsed.hostname
        # `.port` raises ValueError for a non-numeric or out-of-range port.
        port = parsed.port or _DEFAULT_RPC_PORT
    except ValueError as exc:
        sys.exit(f"error: could not parse {url!r}: {exc}")
    if host is None:
        sys.exit(f"error: could not parse host from {url!r}")

    # Honour the scheme: sending Basic-auth credentials in clear text to an
    # endpoint the caller addressed as https:// would be a silent downgrade.
    # A scheme-less `//host:port/` keeps its historical plain-HTTP meaning.
    if parsed.scheme in ("", "http"):
        conn_cls = http.client.HTTPConnection
    elif parsed.scheme == "https":
        conn_cls = http.client.HTTPSConnection
    else:
        sys.exit(f"error: unsupported URL scheme {parsed.scheme!r} in {url!r} (expected http or https)")
    return conn_cls(host, port, timeout=60), parsed.path or "/"


def _make_caller(conn, path, auth_header):
    """Return ``call(method, params, rid=0)`` bound to one persistent connection."""
    headers = {
        "Authorization": auth_header,
        "Content-Type": "application/json",
        "Connection": "keep-alive",
    }

    def call(method, params, rid=0):
        """Issue one JSON-RPC request and return its ``result`` member.

        Raises:
            _RpcError: the reply body contained a non-empty ``error`` member.
            RuntimeError: any other non-200 reply, or a body that is not a
                JSON object.
        """
        payload = json.dumps({"jsonrpc": "1.0", "id": str(rid), "method": method, "params": params})
        conn.request("POST", path, payload, headers)
        resp = conn.getresponse()
        # The body must be read in full before the connection can carry the
        # next request.
        data = resp.read()
        try:
            decoded = json.loads(data)
        except ValueError:
            decoded = None
        # Look for a JSON-RPC error before looking at the HTTP status, so an
        # RPC-level failure delivered with a non-200 status is still reported
        # as an RPC error rather than as a transport failure.
        if isinstance(decoded, dict) and decoded.get("error"):
            raise _RpcError(f"RPC error: {decoded['error']}")
        if resp.status != 200:
            raise RuntimeError(f"HTTP {resp.status}: {data[:200]!r}")
        if not isinstance(decoded, dict):
            raise RuntimeError(f"unexpected response body: {data[:200]!r}")
        return decoded.get("result")

    return call


def _bench(label, n, fn):
    """Call ``fn(i)`` for ``i`` in ``range(n)``, print one table row, return ms per call."""
    t0 = time.monotonic()
    for i in range(n):
        fn(i)
    dt = time.monotonic() - t0
    per_ms = dt * 1000 / n
    rate = n / dt if dt > 0 else float("inf")
    print(f"  {label:45} n={n:5}  total={dt:7.3f}s  {per_ms:7.3f} ms/call  {rate:7.1f} call/s")
    return per_ms


def _run_suite(call, url):
    """Run every benchmark phase through ``call`` and print the report to stdout."""
    # Warm the persistent connection — first call pays TCP+TLS setup, not RPC.
    call("uptime", [])

    print(f"target: {url}")
    print()

    print("[1] pure latency — small request, small response")
    _bench("getbestblockhash", 1000, lambda i: call("getbestblockhash", [], i))
    _bench("getblockcount", 1000, lambda i: call("getblockcount", [], i))
    print()

    tip_hash = call("getbestblockhash", [])
    tip_height = call("getblockcount", [])

    print("[2] chain meta — frigate calls these during reorg detection")
    _bench("getblockhash(tip)", 1000, lambda i: call("getblockhash", [tip_height], i))
    _bench("getblockheader(tip, verbose)", 1000, lambda i: call("getblockheader", [tip_hash, True], i))
    print()

    print("[3] big payload — what initial block sync transfers")
    _bench("getblock(tip, 0) raw block hex", 10, lambda i: call("getblock", [tip_hash, 0], i))
    _bench("getblock(tip, 1) txids only", 10, lambda i: call("getblock", [tip_hash, 1], i))
    print()

    print("[4] mempool-init hot loop — the actual frigate startup bottleneck")
    t0 = time.monotonic()
    mempool = call("getrawmempool", [])
    dt = time.monotonic() - t0
    n_mempool = len(mempool)
    print(f"  {'getrawmempool':45} n={1:5}  total={dt:7.3f}s  ({n_mempool} txids returned)")

    sample_n = min(1000, n_mempool)
    if sample_n == 0:
        print("  (mempool empty — skipping getrawtransaction sample)")
        return
    sample = random.sample(mempool, sample_n)
    failed = 0

    def fetch(i):
        nonlocal failed
        try:
            call("getrawtransaction", [sample[i], False], i)
        except _RpcError:
            # The txids are a snapshot; by the time a given one is requested
            # the server may no longer be able to return it. The round trip
            # was still paid, so the call stays in the timing — it is only
            # counted instead of aborting the whole run.
            failed += 1

    per_tx_ms = _bench("getrawtransaction (random sample)", sample_n, fetch)
    if failed:
        print(f"  ({failed} of {sample_n} sampled txids returned an RPC error — likely left the mempool mid-run)")
    print()

    # ms/call * tx count / 1000 = seconds of sequential fetching.
    extrapolated = per_tx_ms * n_mempool / 1000.0
    print("[summary] extrapolated mempool init cost on this link:")
    print(f"  current mempool: {n_mempool} transactions")
    print(f"  at {per_tx_ms:.2f} ms/call → ~{extrapolated:.0f} seconds = {extrapolated / 60:.1f} minutes")


def main() -> None:
    """Benchmark the RPC endpoint named by ``sys.argv[1]``.

    Reads a single ``user:password`` line from stdin, sends every request over
    one keep-alive connection, and prints the timing report to stdout.

    Exits with status 0 after printing the module help when ``-h``/``--help``
    is given, with status 1 when the URL argument is missing, and with an
    ``error: ...`` message when the credentials or the URL are unusable.

    Raises:
        RuntimeError: a benchmark call failed (non-200 reply or JSON-RPC
            error), other than per-transaction RPC errors in the
            ``getrawtransaction`` sample, which are counted and reported.
    """
    if len(sys.argv) < 2 or sys.argv[1] in ("-h", "--help"):
        print(__doc__, file=sys.stderr)
        sys.exit(0 if {"-h", "--help"} & set(sys.argv) else 1)

    url = sys.argv[1]
    creds_line = sys.stdin.readline().strip()
    if ":" not in creds_line:
        sys.exit("error: stdin must be a single `user:password` line")
    # The stripped line already has the `user:password` shape Basic auth wants.
    auth_header = "Basic " + base64.b64encode(creds_line.encode()).decode()

    conn, path = _open_connection(url)
    try:
        _run_suite(_make_caller(conn, path, auth_header), url)
    finally:
        # Release the socket on every exit path, including a failed call.
        conn.close()


if __name__ == "__main__":
    main()