diff --git a/build/Dockerfile.example b/build/Dockerfile.example index 14c75fde8..8a85fb2ba 100644 --- a/build/Dockerfile.example +++ b/build/Dockerfile.example @@ -12,7 +12,7 @@ # Stage 0 # Build the binaries -FROM golang:1.24-alpine +FROM golang:1.26-alpine WORKDIR /app COPY go.mod go.sum ./ RUN go mod download diff --git a/build/dev/Dockerfile b/build/dev/Dockerfile index 722a3f2d4..514fa8ce4 100644 --- a/build/dev/Dockerfile +++ b/build/dev/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1.24-alpine AS fetch +FROM golang:1.26-alpine AS fetch RUN go install github.com/air-verse/air@v1.61.1 WORKDIR /app COPY . . diff --git a/build/test/Dockerfile.mock b/build/test/Dockerfile.mock index 7afe35142..9330c0fc4 100644 --- a/build/test/Dockerfile.mock +++ b/build/test/Dockerfile.mock @@ -1,4 +1,4 @@ -FROM golang:1.24-alpine +FROM golang:1.26-alpine WORKDIR /app COPY . . RUN go build -o /bin/mock ./cmd/destinations/mock diff --git a/cmd/bench/metrics/Makefile b/cmd/bench/metrics/Makefile new file mode 100644 index 000000000..cbaf96dda --- /dev/null +++ b/cmd/bench/metrics/Makefile @@ -0,0 +1,83 @@ +# Metrics Benchmarks Makefile +# Usage: cd cmd/bench/metrics && make + +OUTPOST_ROOT := $(shell git rev-parse --show-toplevel) +PG_URL := postgres://outpost:outpost@localhost:5488/bench?sslmode=disable +CH_ADDR := localhost:9009 +CH_DB := bench +PG_ROWS := 10000000 +CH_ROWS := 10000000 + +# ── Infra ──────────────────────────────────────────────────────────────────── + +up/pg: + docker compose -f pg/docker-compose.yml up -d + +up/ch: + docker compose -f ch/docker-compose.yml up -d + +up: up/pg up/ch + +down/pg: + docker compose -f pg/docker-compose.yml down -v + +down/ch: + docker compose -f ch/docker-compose.yml down -v + +down: down/pg down/ch + +# ── Migrations ─────────────────────────────────────────────────────────────── + +migrate/pg: + cd $(OUTPOST_ROOT) && BENCH_PG_URL="$(PG_URL)" \ + go test -run='^$$' -bench=BenchmarkPG -benchtime=1x ./cmd/bench/metrics/ + +migrate/ch: + 
cd $(OUTPOST_ROOT) && BENCH_CH_ADDR="$(CH_ADDR)" BENCH_CH_DB="$(CH_DB)" \ + go test -run='^$$' -bench=BenchmarkCH -benchtime=1x ./cmd/bench/metrics/ + +migrate: migrate/pg migrate/ch + +# ── Seeding ────────────────────────────────────────────────────────────────── + +seed/pg: + psql "$(PG_URL)" -v ROWS=$(PG_ROWS) -f pg/seed.sql + +seed/ch: + clickhouse client --port 9009 --database $(CH_DB) \ + --param_rows $(CH_ROWS) < ch/seed.sql + +# ── Benchmarks ─────────────────────────────────────────────────────────────── + +bench/pg: + cd $(OUTPOST_ROOT) && BENCH_PG_URL="$(PG_URL)" \ + go test -bench=BenchmarkPG -benchtime=1x -count=1 -timeout=30m ./cmd/bench/metrics/ + +bench/ch: + cd $(OUTPOST_ROOT) && BENCH_CH_ADDR="$(CH_ADDR)" BENCH_CH_DB="$(CH_DB)" \ + go test -bench=BenchmarkCH -benchtime=1x -count=1 -timeout=30m ./cmd/bench/metrics/ + +bench/pg/sustained: + cd $(OUTPOST_ROOT) && BENCH_PG_URL="$(PG_URL)" \ + go test -bench=BenchmarkPG -benchtime=10s -count=3 -timeout=30m ./cmd/bench/metrics/ + +bench/ch/sustained: + cd $(OUTPOST_ROOT) && BENCH_CH_ADDR="$(CH_ADDR)" BENCH_CH_DB="$(CH_DB)" \ + go test -bench=BenchmarkCH -benchtime=10s -count=3 -timeout=30m ./cmd/bench/metrics/ + +bench: bench/pg bench/ch + +# ── Full workflow ──────────────────────────────────────────────────────────── + +setup/pg: up/pg migrate/pg seed/pg +setup/ch: up/ch migrate/ch seed/ch +setup: setup/pg setup/ch + +reset/pg: down/pg up/pg migrate/pg seed/pg +reset/ch: down/ch up/ch migrate/ch seed/ch + +.PHONY: up/pg up/ch up down/pg down/ch down \ + migrate/pg migrate/ch migrate \ + seed/pg seed/ch \ + bench/pg bench/ch bench/pg/sustained bench/ch/sustained bench \ + setup/pg setup/ch setup reset/pg reset/ch diff --git a/cmd/bench/metrics/README.md b/cmd/bench/metrics/README.md new file mode 100644 index 000000000..370c5b2eb --- /dev/null +++ b/cmd/bench/metrics/README.md @@ -0,0 +1,63 @@ +# Metrics Benchmarks + +Benchmarks `QueryEventMetrics` / `QueryAttemptMetrics` against PostgreSQL and 
ClickHouse. + +Shared test cases in `bench_test.go`, backend-specific setup in `pg_test.go` / `ch_test.go`. + +## Quick Start + +```bash +cd outpost/cmd/bench/metrics + +# ── ClickHouse ─────────────────────────────── +make setup/ch # up + migrate + seed (10M) +make bench/ch # single iteration +make bench/ch/sustained # 10s x 3 runs +make down/ch # cleanup + +# ── PostgreSQL ─────────────────────────────── +make setup/pg # up + migrate + seed (10M) +make bench/pg # single iteration +make bench/pg/sustained # 10s x 3 runs +make down/pg # cleanup + +# ── Both ───────────────────────────────────── +make setup # setup both +make bench # bench both +make down # cleanup both +``` + +### Individual steps + +```bash +make up/ch # start container +make migrate/ch # run migrations +make seed/ch # seed data (default 10M, override: make seed/ch CH_ROWS=1000000000) +make bench/ch # run benchmarks +make reset/ch # down + up + migrate + seed (fresh start) +``` + +Same targets available for `/pg`. + +## Structure + +``` +metrics/ + Makefile # all commands + bench_test.go # shared test cases + date ranges + helpers + pg_test.go # PG setup (BENCH_PG_URL) + ch_test.go # CH setup (BENCH_CH_ADDR) + pg/ # PG infra (docker-compose, seed.sql) + ch/ # CH infra (docker-compose, seed.sql, config/) +``` + +## Data Distribution + +Deterministic via modulo arithmetic (identical for both backends): + +- **2 tenants** — `tenant_0` (90%), `tenant_1` (10%) +- **500 destinations** — `dest_0` through `dest_499` +- **3 topics** — `order.created`, `order.updated`, `payment.received` +- **Time** — evenly spread across January 2000 +- **Attempts** — chained retries (1 event -> 1-4 attempts), 0.5% permanently failed +- 10M events -> ~12.6M attempts (22.6M total rows) diff --git a/cmd/bench/metrics/bench_test.go b/cmd/bench/metrics/bench_test.go new file mode 100644 index 000000000..0cb59bf68 --- /dev/null +++ b/cmd/bench/metrics/bench_test.go @@ -0,0 +1,465 @@ +package metrics + +import ( + "context" + 
"testing" + "time" + + "github.com/hookdeck/outpost/internal/logstore/driver" +) + +// ── Time ranges ───────────────────────────────────────────────────────────── + +var ( + // Full month — all seeded data lives here. + fullMonth = driver.TimeRange{ + Start: time.Date(2000, 1, 1, 0, 0, 0, 0, time.UTC), + End: time.Date(2000, 2, 1, 0, 0, 0, 0, time.UTC), + } + oneDay = driver.TimeRange{ + Start: time.Date(2000, 1, 15, 0, 0, 0, 0, time.UTC), + End: time.Date(2000, 1, 16, 0, 0, 0, 0, time.UTC), + } + oneWeek = driver.TimeRange{ + Start: time.Date(2000, 1, 8, 0, 0, 0, 0, time.UTC), + End: time.Date(2000, 1, 15, 0, 0, 0, 0, time.UTC), + } +) + +// ── Helpers ───────────────────────────────────────────────────────────────── + +func hourly() *driver.Granularity { return &driver.Granularity{Value: 1, Unit: "h"} } +func daily() *driver.Granularity { return &driver.Granularity{Value: 1, Unit: "d"} } +func twoDays() *driver.Granularity { return &driver.Granularity{Value: 2, Unit: "d"} } +func weekly() *driver.Granularity { return &driver.Granularity{Value: 1, Unit: "w"} } +func monthly() *driver.Granularity { return &driver.Granularity{Value: 1, Unit: "M"} } + +func tenant0() map[string][]string { return map[string][]string{"tenant_id": {"tenant_0"}} } + +func withTenant0(extra map[string][]string) map[string][]string { + m := tenant0() + for k, v := range extra { + m[k] = v + } + return m +} + +// ── Event Benchmarks ──────────────────────────────────────────────────────── + +var eventCases = []struct { + name string + req driver.MetricsRequest +}{ + { + name: "CountAll", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"count"}, + Filters: tenant0(), + }, + }, + { + name: "RateAll", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"rate"}, + Filters: tenant0(), + }, + }, + { + name: "CountAndRate", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"count", "rate"}, + Filters: tenant0(), + }, + }, 
+ { + name: "CountByTopic", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"count"}, + Dimensions: []string{"topic"}, + Filters: tenant0(), + }, + }, + { + name: "CountByDestination", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"count"}, + Dimensions: []string{"destination_id"}, + Filters: tenant0(), + }, + }, + { + name: "CountByTenant", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"count"}, + Dimensions: []string{"tenant_id"}, + }, + }, + { + name: "Hourly_1Day", + req: driver.MetricsRequest{ + TimeRange: oneDay, + Granularity: hourly(), + Measures: []string{"count"}, + Filters: tenant0(), + }, + }, + { + name: "Hourly_1Week", + req: driver.MetricsRequest{ + TimeRange: oneWeek, + Granularity: hourly(), + Measures: []string{"count"}, + Filters: tenant0(), + }, + }, + { + name: "Daily_1Month", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Granularity: daily(), + Measures: []string{"count"}, + Filters: tenant0(), + }, + }, + { + name: "TwoDays_1Month", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Granularity: twoDays(), + Measures: []string{"count"}, + Filters: tenant0(), + }, + }, + { + name: "Weekly_1Month", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Granularity: weekly(), + Measures: []string{"count"}, + Filters: tenant0(), + }, + }, + { + name: "Monthly_1Month", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Granularity: monthly(), + Measures: []string{"count"}, + Filters: tenant0(), + }, + }, + { + name: "RateHourly_1Day", + req: driver.MetricsRequest{ + TimeRange: oneDay, + Granularity: hourly(), + Measures: []string{"rate"}, + Filters: tenant0(), + }, + }, + { + name: "FilterByTopic", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"count"}, + Filters: withTenant0(map[string][]string{"topic": {"order.created"}}), + }, + }, + { + name: "FilterByDestination", + req: driver.MetricsRequest{ + TimeRange: 
fullMonth, + Measures: []string{"count"}, + Filters: withTenant0(map[string][]string{"destination_id": {"dest_0"}}), + }, + }, + { + name: "SmallTenant", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Granularity: daily(), + Measures: []string{"count"}, + Filters: map[string][]string{"tenant_id": {"tenant_1"}}, + }, + }, +} + +// ── Attempt Benchmarks ────────────────────────────────────────────────────── + +var attemptCases = []struct { + name string + req driver.MetricsRequest +}{ + { + name: "CountAll", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"count"}, + Filters: tenant0(), + }, + }, + { + name: "RateAll", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"rate"}, + Filters: tenant0(), + }, + }, + { + name: "SuccessfulRate", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"successful_rate"}, + Filters: tenant0(), + }, + }, + { + name: "FailedRate", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"failed_rate"}, + Filters: tenant0(), + }, + }, + { + name: "CountByTopic", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"count"}, + Dimensions: []string{"topic"}, + Filters: tenant0(), + }, + }, + { + name: "CountByDestination", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"count"}, + Dimensions: []string{"destination_id"}, + Filters: tenant0(), + }, + }, + { + name: "CountByStatus", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"count"}, + Dimensions: []string{"status"}, + Filters: tenant0(), + }, + }, + { + name: "CountByCode", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"count"}, + Dimensions: []string{"code"}, + Filters: tenant0(), + }, + }, + { + name: "CountByAttemptNumber", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"count"}, + Dimensions: []string{"attempt_number"}, + Filters: tenant0(), + }, 
+ }, + { + name: "Hourly_1Day", + req: driver.MetricsRequest{ + TimeRange: oneDay, + Granularity: hourly(), + Measures: []string{"count"}, + Filters: tenant0(), + }, + }, + { + name: "Hourly_1Week", + req: driver.MetricsRequest{ + TimeRange: oneWeek, + Granularity: hourly(), + Measures: []string{"count"}, + Filters: tenant0(), + }, + }, + { + name: "Daily_1Month", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Granularity: daily(), + Measures: []string{"count"}, + Filters: tenant0(), + }, + }, + { + name: "TwoDays_1Month", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Granularity: twoDays(), + Measures: []string{"count"}, + Filters: tenant0(), + }, + }, + { + name: "Weekly_1Month", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Granularity: weekly(), + Measures: []string{"count"}, + Filters: tenant0(), + }, + }, + { + name: "AllMeasures", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{ + "count", + "successful_count", + "failed_count", + "error_rate", + "first_attempt_count", + "retry_count", + "manual_retry_count", + "avg_attempt_number", + "rate", + "successful_rate", + "failed_rate", + }, + Filters: tenant0(), + }, + }, + { + name: "AllMeasures_Daily", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Granularity: daily(), + Measures: []string{ + "count", + "successful_count", + "failed_count", + "error_rate", + "rate", + "successful_rate", + "failed_rate", + }, + Filters: tenant0(), + }, + }, + { + name: "FilterByStatus", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"count"}, + Filters: withTenant0(map[string][]string{"status": {"failed"}}), + }, + }, + { + name: "FilterByCode", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"count"}, + Filters: withTenant0(map[string][]string{"code": {"500"}}), + }, + }, + { + name: "FilterByManual", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"count"}, + Filters: 
withTenant0(map[string][]string{"manual": {"true"}}), + }, + }, + { + name: "FilterByAttemptNumber", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"count"}, + Filters: withTenant0(map[string][]string{"attempt_number": {"1"}}), + }, + }, + { + name: "FilterByTopic", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"count"}, + Filters: withTenant0(map[string][]string{"topic": {"order.created"}}), + }, + }, + { + name: "MultiDimension", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"count"}, + Dimensions: []string{"topic", "destination_id", "status"}, + Filters: tenant0(), + }, + }, + { + name: "MultiFilter", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Measures: []string{"count"}, + Filters: withTenant0(map[string][]string{ + "status": {"failed"}, + "topic": {"order.created"}, + }), + }, + }, + { + name: "SmallTenant", + req: driver.MetricsRequest{ + TimeRange: fullMonth, + Granularity: daily(), + Measures: []string{"count"}, + Filters: map[string][]string{"tenant_id": {"tenant_1"}}, + }, + }, +} + +func benchmarkEventMetrics(b *testing.B, store driver.Metrics) { + ctx := context.Background() + + for _, tc := range eventCases { + b.Run(tc.name, func(b *testing.B) { + // Warm up. + if _, err := store.QueryEventMetrics(ctx, tc.req); err != nil { + b.Fatalf("warmup: %v", err) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + if _, err := store.QueryEventMetrics(ctx, tc.req); err != nil { + b.Fatalf("query: %v", err) + } + } + }) + } +} + +func benchmarkAttemptMetrics(b *testing.B, store driver.Metrics) { + ctx := context.Background() + + for _, tc := range attemptCases { + b.Run(tc.name, func(b *testing.B) { + // Warm up. 
+ if _, err := store.QueryAttemptMetrics(ctx, tc.req); err != nil { + b.Fatalf("warmup: %v", err) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + if _, err := store.QueryAttemptMetrics(ctx, tc.req); err != nil { + b.Fatalf("query: %v", err) + } + } + }) + } +} diff --git a/cmd/bench/metrics/ch/README.md b/cmd/bench/metrics/ch/README.md new file mode 100644 index 000000000..77bfff5fc --- /dev/null +++ b/cmd/bench/metrics/ch/README.md @@ -0,0 +1,65 @@ +# CH Metrics Benchmark + +Benchmarks `QueryEventMetrics` / `QueryAttemptMetrics` against ClickHouse (2 CPU, 8GB). + +## Prerequisites + +- Docker (Compose v2) +- Go 1.24+ +- `clickhouse` CLI + +## Quick Start + +```bash +cd outpost + +# 1. Start CH +docker compose -f cmd/bench/metrics/ch/docker-compose.yml up -d + +# 2. Run migrations +BENCH_CH_ADDR="localhost:9009" \ + go test -run='^$' -bench=BenchmarkCH -benchtime=1x ./cmd/bench/metrics/ + +# 3. Seed (default 10M — adjust --param_rows N) +clickhouse client --port 9009 --database bench \ + --param_rows 10000000 < cmd/bench/metrics/ch/seed.sql + +# 4a. Single iteration +BENCH_CH_ADDR="localhost:9009" \ + go test -bench=BenchmarkCH -benchtime=1x -count=1 -timeout=30m ./cmd/bench/metrics/ + +# 4b. Sustained (10s x 3 runs) +BENCH_CH_ADDR="localhost:9009" \ + go test -bench=BenchmarkCH -benchtime=10s -count=3 -timeout=30m ./cmd/bench/metrics/ + +# 5. 
Cleanup +docker compose -f cmd/bench/metrics/ch/docker-compose.yml down -v +``` + +## Re-seeding + +```bash +docker compose -f cmd/bench/metrics/ch/docker-compose.yml down -v +docker compose -f cmd/bench/metrics/ch/docker-compose.yml up -d +# Repeat steps 2-4 +``` + +## Data Distribution + +Deterministic via modulo arithmetic (shared with PG bench): + +- **2 tenants** — `tenant_0` (90%), `tenant_1` (10%) +- **500 destinations** — `dest_0` through `dest_499` +- **3 topics** — `order.created`, `order.updated`, `payment.received` +- **Time** — evenly spread across January 2000 +- **Attempts** — chained retries (1 event -> 1-4 attempts), 0.5% permanently failed +- 10M events -> ~12.6M attempts + +## Resource Tuning + +| Setting | Default | Purpose | +|---------|---------|---------| +| CPUs | 2 | Parallel query threads | +| Memory | 8GB | Container limit | +| max_memory_usage | 6GB | Per-query memory limit | +| max_threads | 2 | Query parallelism | diff --git a/cmd/bench/metrics/ch/config/users.xml b/cmd/bench/metrics/ch/config/users.xml new file mode 100644 index 000000000..797d914bd --- /dev/null +++ b/cmd/bench/metrics/ch/config/users.xml @@ -0,0 +1,8 @@ + + + + 6000000000 + 2 + + + diff --git a/cmd/bench/metrics/ch/docker-compose.yml b/cmd/bench/metrics/ch/docker-compose.yml new file mode 100644 index 000000000..f7f4cc1e3 --- /dev/null +++ b/cmd/bench/metrics/ch/docker-compose.yml @@ -0,0 +1,26 @@ +services: + clickhouse: + image: clickhouse/clickhouse-server:24 + environment: + CLICKHOUSE_DB: bench + CLICKHOUSE_USER: default + CLICKHOUSE_PASSWORD: "" + ports: + - "9009:9000" # native protocol + - "8124:8123" # HTTP + deploy: + resources: + limits: + cpus: "2" + memory: 8g + ulimits: + nofile: + soft: 262144 + hard: 262144 + volumes: + - ./config/users.xml:/etc/clickhouse-server/users.d/bench.xml:ro + healthcheck: + test: ["CMD-SHELL", "clickhouse-client --query 'SELECT 1'"] + interval: 2s + timeout: 5s + retries: 10 diff --git a/cmd/bench/metrics/ch/seed.sql 
b/cmd/bench/metrics/ch/seed.sql new file mode 100644 index 000000000..84efb697e --- /dev/null +++ b/cmd/bench/metrics/ch/seed.sql @@ -0,0 +1,245 @@ +-- seed.sql — Deterministic bulk seeding for CH metrics benchmarks. +-- +-- Usage: +-- clickhouse client --port 9009 --database bench --param_rows 10000000 < cmd/bench/metrics/ch/seed.sql +-- +-- Default rows = 10000000 (10M). Override with --param_rows N. +-- +-- Distribution (same as PG bench): +-- 2 tenants: tenant_0 gets 90%, tenant_1 gets 10% +-- Time: evenly spread across January 2000 (2000-01-01 to 2000-02-01) +-- +-- Attempt chain (1 event -> 1-4 attempts): +-- attempt 0: all events. Failed if n%5=0 (20%) +-- attempt 1: failed attempt 0. Failed if n%20=0 (25% of retries) +-- attempt 2: failed attempt 1. Failed if n%100=0 (20% of retries) +-- attempt 3: failed attempt 2. Failed if n%200=0 (50% of retries) +-- +-- For 10M events -> ~12.6M attempts. 0.5% events permanently failed. + +SELECT concat('Seeding ', toString({rows:UInt64}), ' events + chained attempts...') AS message; + +-- ============================================================================ +-- 1. Bulk INSERT into events +-- ============================================================================ +-- +-- Tenants: n%10 == 0 -> tenant_1 (10%), else tenant_0 (90%) +-- Destinations: dest_(n%500) [500 destinations] +-- Topics: n%3 -> order.created / order.updated / payment.received +-- Time: Even spread across 2000-01-01 to 2000-02-01 +-- eligible_for_retry: n%3 != 2 + +SELECT '[1/7] Inserting events...' 
AS message; + +INSERT INTO events (event_id, tenant_id, destination_id, topic, eligible_for_retry, event_time, metadata, data) +SELECT + concat('evt_', toString(number)) AS event_id, + if(number % 10 = 0, 'tenant_1', 'tenant_0') AS tenant_id, + concat('dest_', toString(number % 500)) AS destination_id, + multiIf( + number % 3 = 0, 'order.created', + number % 3 = 1, 'order.updated', + 'payment.received' + ) AS topic, + number % 3 != 2 AS eligible_for_retry, + toDateTime64('2000-01-01', 3) + + toIntervalMillisecond( + toUInt64(number * 2678400000 / {rows:UInt64}) + ) AS event_time, + '{}' AS metadata, + '{}' AS data +FROM numbers({rows:UInt64}); + +-- ============================================================================ +-- 2. Bulk INSERT into attempts (chained retries) +-- ============================================================================ +-- +-- Each attempt's time = event_time + (attempt_number * 1 second). +-- manual: only attempt_number >= 3 AND n%10=9 (10% of late retries). +-- Code: success->200/201, failed->500/422 (alternating on n%2). + +SELECT '[2/7] Inserting attempt 1 (all events)...' 
AS message; + +INSERT INTO attempts ( + event_id, tenant_id, destination_id, topic, eligible_for_retry, event_time, metadata, data, + attempt_id, status, attempt_time, code, response_data, manual, attempt_number +) +SELECT + concat('evt_', toString(number)) AS event_id, + if(number % 10 = 0, 'tenant_1', 'tenant_0') AS tenant_id, + concat('dest_', toString(number % 500)) AS destination_id, + multiIf( + number % 3 = 0, 'order.created', + number % 3 = 1, 'order.updated', + 'payment.received' + ) AS topic, + number % 3 != 2 AS eligible_for_retry, + toDateTime64('2000-01-01', 3) + + toIntervalMillisecond( + toUInt64(number * 2678400000 / {rows:UInt64}) + ) AS event_time, + '{}' AS metadata, + '{}' AS data, + concat('att_', toString(number), '_0') AS attempt_id, + if(number % 5 = 0, 'failed', 'success') AS status, + toDateTime64('2000-01-01', 3) + + toIntervalMillisecond( + toUInt64(number * 2678400000 / {rows:UInt64}) + ) + + toIntervalSecond(1) AS attempt_time, + multiIf( + number % 5 != 0 AND number % 2 = 0, '200', + number % 5 != 0, '201', + number % 2 = 0, '500', + '422' + ) AS code, + '' AS response_data, + false AS manual, + toUInt32(1) AS attempt_number +FROM numbers({rows:UInt64}); + +SELECT '[3/7] Inserting attempt 2 (20% of events)...' 
AS message; + +INSERT INTO attempts ( + event_id, tenant_id, destination_id, topic, eligible_for_retry, event_time, metadata, data, + attempt_id, status, attempt_time, code, response_data, manual, attempt_number +) +SELECT + concat('evt_', toString(number)) AS event_id, + if(number % 10 = 0, 'tenant_1', 'tenant_0') AS tenant_id, + concat('dest_', toString(number % 500)) AS destination_id, + multiIf( + number % 3 = 0, 'order.created', + number % 3 = 1, 'order.updated', + 'payment.received' + ) AS topic, + number % 3 != 2 AS eligible_for_retry, + toDateTime64('2000-01-01', 3) + + toIntervalMillisecond( + toUInt64(number * 2678400000 / {rows:UInt64}) + ) AS event_time, + '{}' AS metadata, + '{}' AS data, + concat('att_', toString(number), '_1') AS attempt_id, + if(number % 20 = 0, 'failed', 'success') AS status, + toDateTime64('2000-01-01', 3) + + toIntervalMillisecond( + toUInt64(number * 2678400000 / {rows:UInt64}) + ) + + toIntervalSecond(2) AS attempt_time, + multiIf( + number % 20 != 0 AND number % 2 = 0, '200', + number % 20 != 0, '201', + number % 2 = 0, '500', + '422' + ) AS code, + '' AS response_data, + false AS manual, + toUInt32(2) AS attempt_number +FROM numbers({rows:UInt64}) +WHERE number % 5 = 0; + +SELECT '[4/7] Inserting attempt 3 (5% of events)...' 
AS message; + +INSERT INTO attempts ( + event_id, tenant_id, destination_id, topic, eligible_for_retry, event_time, metadata, data, + attempt_id, status, attempt_time, code, response_data, manual, attempt_number +) +SELECT + concat('evt_', toString(number)) AS event_id, + if(number % 10 = 0, 'tenant_1', 'tenant_0') AS tenant_id, + concat('dest_', toString(number % 500)) AS destination_id, + multiIf( + number % 3 = 0, 'order.created', + number % 3 = 1, 'order.updated', + 'payment.received' + ) AS topic, + number % 3 != 2 AS eligible_for_retry, + toDateTime64('2000-01-01', 3) + + toIntervalMillisecond( + toUInt64(number * 2678400000 / {rows:UInt64}) + ) AS event_time, + '{}' AS metadata, + '{}' AS data, + concat('att_', toString(number), '_2') AS attempt_id, + if(number % 100 = 0, 'failed', 'success') AS status, + toDateTime64('2000-01-01', 3) + + toIntervalMillisecond( + toUInt64(number * 2678400000 / {rows:UInt64}) + ) + + toIntervalSecond(3) AS attempt_time, + multiIf( + number % 100 != 0 AND number % 2 = 0, '200', + number % 100 != 0, '201', + number % 2 = 0, '500', + '422' + ) AS code, + '' AS response_data, + number % 10 = 9 AS manual, + toUInt32(3) AS attempt_number +FROM numbers({rows:UInt64}) +WHERE number % 20 = 0; + +SELECT '[5/7] Inserting attempt 4 (1% of events)...' 
AS message; + +INSERT INTO attempts ( + event_id, tenant_id, destination_id, topic, eligible_for_retry, event_time, metadata, data, + attempt_id, status, attempt_time, code, response_data, manual, attempt_number +) +SELECT + concat('evt_', toString(number)) AS event_id, + if(number % 10 = 0, 'tenant_1', 'tenant_0') AS tenant_id, + concat('dest_', toString(number % 500)) AS destination_id, + multiIf( + number % 3 = 0, 'order.created', + number % 3 = 1, 'order.updated', + 'payment.received' + ) AS topic, + number % 3 != 2 AS eligible_for_retry, + toDateTime64('2000-01-01', 3) + + toIntervalMillisecond( + toUInt64(number * 2678400000 / {rows:UInt64}) + ) AS event_time, + '{}' AS metadata, + '{}' AS data, + concat('att_', toString(number), '_3') AS attempt_id, + if(number % 200 = 0, 'failed', 'success') AS status, + toDateTime64('2000-01-01', 3) + + toIntervalMillisecond( + toUInt64(number * 2678400000 / {rows:UInt64}) + ) + + toIntervalSecond(4) AS attempt_time, + multiIf( + number % 200 != 0 AND number % 2 = 0, '200', + number % 200 != 0, '201', + number % 2 = 0, '500', + '422' + ) AS code, + '' AS response_data, + number % 10 = 9 AS manual, + toUInt32(4) AS attempt_number +FROM numbers({rows:UInt64}) +WHERE number % 100 = 0; + +-- ============================================================================ +-- 3. OPTIMIZE (force ReplacingMergeTree merge) +-- ============================================================================ + +SELECT '[6/7] Optimizing (forcing merge)...' AS message; + +OPTIMIZE TABLE events FINAL; +OPTIMIZE TABLE attempts FINAL; + +-- ============================================================================ +-- 4. Report +-- ============================================================================ + +SELECT '[7/7] Done. 
Row counts:' AS message; + +SELECT 'events' AS table_name, count() AS row_count FROM events +UNION ALL +SELECT 'attempts' AS table_name, count() AS row_count FROM attempts; + +SELECT attempt_number, status, count() AS cnt +FROM attempts GROUP BY attempt_number, status +ORDER BY attempt_number, status; diff --git a/cmd/bench/metrics/ch_test.go b/cmd/bench/metrics/ch_test.go new file mode 100644 index 000000000..6b54c61ed --- /dev/null +++ b/cmd/bench/metrics/ch_test.go @@ -0,0 +1,73 @@ +package metrics + +import ( + "context" + "os" + "testing" + + "github.com/hookdeck/outpost/internal/clickhouse" + "github.com/hookdeck/outpost/internal/logstore/chlogstore" + "github.com/hookdeck/outpost/internal/logstore/driver" + "github.com/hookdeck/outpost/internal/migrator" +) + +func newCHBench(tb testing.TB) driver.Metrics { + tb.Helper() + + chAddr := os.Getenv("BENCH_CH_ADDR") + if chAddr == "" { + tb.Skip("BENCH_CH_ADDR not set — skipping CH metrics benchmarks") + } + + chDB := os.Getenv("BENCH_CH_DB") + if chDB == "" { + chDB = "bench" + } + + ctx := context.Background() + + // Run migrations. 
+ m, err := migrator.New(migrator.MigrationOpts{ + CH: migrator.MigrationOptsCH{ + Addr: chAddr, + Database: chDB, + Username: "default", + }, + }) + if err != nil { + tb.Fatalf("migrator: %v", err) + } + _, _, err = m.Up(ctx, -1) + if err != nil { + tb.Fatalf("migrate up: %v", err) + } + srcErr, dbErr := m.Close(ctx) + if srcErr != nil { + tb.Fatalf("migrator close src: %v", srcErr) + } + if dbErr != nil { + tb.Fatalf("migrator close db: %v", dbErr) + } + + conn, err := clickhouse.New(&clickhouse.ClickHouseConfig{ + Addr: chAddr, + Database: chDB, + Username: "default", + }) + if err != nil { + tb.Fatalf("clickhouse: %v", err) + } + tb.Cleanup(func() { conn.Close() }) + + return chlogstore.NewLogStore(conn, "") +} + +func BenchmarkCHEventMetrics(b *testing.B) { + store := newCHBench(b) + benchmarkEventMetrics(b, store) +} + +func BenchmarkCHAttemptMetrics(b *testing.B) { + store := newCHBench(b) + benchmarkAttemptMetrics(b, store) +} diff --git a/cmd/bench/metrics/pg/README.md b/cmd/bench/metrics/pg/README.md new file mode 100644 index 000000000..e35f42204 --- /dev/null +++ b/cmd/bench/metrics/pg/README.md @@ -0,0 +1,66 @@ +# PG Metrics Benchmark + +Benchmarks `QueryEventMetrics` / `QueryAttemptMetrics` against PostgreSQL (2 CPU, 8GB). + +## Prerequisites + +- Docker (Compose v2) +- Go 1.24+ +- `psql` + +## Quick Start + +```bash +cd outpost + +# 1. Start PG +docker compose -f cmd/bench/metrics/pg/docker-compose.yml up -d + +# 2. Run migrations +BENCH_PG_URL="postgres://outpost:outpost@localhost:5488/bench?sslmode=disable" \ + go test -run='^$' -bench=BenchmarkPG -benchtime=1x ./cmd/bench/metrics/ + +# 3. Seed (default 10M — adjust -v ROWS=N) +psql "postgres://outpost:outpost@localhost:5488/bench?sslmode=disable" \ + -v ROWS=10000000 -f cmd/bench/metrics/pg/seed.sql + +# 4a. 
Single iteration +BENCH_PG_URL="postgres://outpost:outpost@localhost:5488/bench?sslmode=disable" \ + go test -bench=BenchmarkPG -benchtime=1x -count=1 -timeout=30m ./cmd/bench/metrics/ + +# 4b. Sustained (10s x 3 runs) +BENCH_PG_URL="postgres://outpost:outpost@localhost:5488/bench?sslmode=disable" \ + go test -bench=BenchmarkPG -benchtime=10s -count=3 -timeout=30m ./cmd/bench/metrics/ + +# 5. Cleanup +docker compose -f cmd/bench/metrics/pg/docker-compose.yml down -v +``` + +## Re-seeding + +```bash +docker compose -f cmd/bench/metrics/pg/docker-compose.yml down -v +docker compose -f cmd/bench/metrics/pg/docker-compose.yml up -d +# Repeat steps 2-4 +``` + +## Data Distribution + +Deterministic via modulo arithmetic (shared with CH bench): + +- **2 tenants** — `tenant_0` (90%), `tenant_1` (10%) +- **500 destinations** — `dest_0` through `dest_499` +- **3 topics** — `order.created`, `order.updated`, `payment.received` +- **Time** — evenly spread across January 2000 +- **Attempts** — chained retries (1 event -> 1-4 attempts), 0.5% permanently failed +- 10M events -> ~12.6M attempts + +## Resource Tuning + +| Setting | Default | Purpose | +|---------|---------|---------| +| CPUs | 2 | Parallel query workers | +| Memory | 8GB | Container limit | +| shared_buffers | 2GB | PG buffer pool | +| work_mem | 512MB | Per-sort/hash memory | +| effective_cache_size | 6GB | Planner hint for OS cache | diff --git a/cmd/bench/metrics/pg/docker-compose.yml b/cmd/bench/metrics/pg/docker-compose.yml new file mode 100644 index 000000000..bd0db31e1 --- /dev/null +++ b/cmd/bench/metrics/pg/docker-compose.yml @@ -0,0 +1,27 @@ +services: + postgres: + image: postgres:16-alpine + environment: + POSTGRES_USER: outpost + POSTGRES_PASSWORD: outpost + POSTGRES_DB: bench + ports: + - "5488:5432" + deploy: + resources: + limits: + cpus: "2" + memory: 8g + command: + - postgres + - -c + - shared_buffers=2GB + - -c + - work_mem=512MB + - -c + - effective_cache_size=6GB + healthcheck: + test: 
["CMD-SHELL", "pg_isready -U outpost -d bench"] + interval: 2s + timeout: 5s + retries: 10 diff --git a/cmd/bench/metrics/pg/seed.sql b/cmd/bench/metrics/pg/seed.sql new file mode 100644 index 000000000..aff9b45a5 --- /dev/null +++ b/cmd/bench/metrics/pg/seed.sql @@ -0,0 +1,256 @@ +-- seed.sql — Deterministic bulk seeding for PG metrics benchmarks. +-- +-- Usage: +-- psql "$POSTGRES_URL" -v ROWS=10000000 -f cmd/bench/metrics/pg/seed.sql +-- +-- Default :ROWS = 10000000 (10M). Override with -v ROWS=N. +-- +-- Distribution: +-- 2 tenants: tenant_0 gets 90%, tenant_1 gets 10% +-- Time: evenly spread across January 2000 (2000-01-01 to 2000-02-01) +-- No explicit partitions — data lands in the default partition. +-- +-- Attempt chain (1 event → 1-4 attempts): +-- attempt 0: all events. Failed if n%5=0 (20%) +-- attempt 1: failed attempt 0. Failed if n%20=0 (25% of retries) +-- attempt 2: failed attempt 1. Failed if n%100=0 (20% of retries) +-- attempt 3: failed attempt 2. Failed if n%200=0 (50% of retries) +-- +-- For 10M events → ~12.6M attempts. 0.5% events permanently failed. + +\set ON_ERROR_STOP on +\timing on + +-- Default if not supplied via -v +SELECT COALESCE(:'ROWS', '10000000') AS rows_count \gset + +\echo Seeding :rows_count events + chained attempts... + +-- ============================================================================ +-- 1. Bulk INSERT into events +-- ============================================================================ +-- +-- Tenants: n%10 == 0 → tenant_1 (10%), else tenant_0 (90%) +-- Destinations: dest_(n%500) [500 destinations] +-- Topics: n%3 → order.created / order.updated / payment.received +-- Time: Even spread across 2000-01-01 to 2000-02-01 +-- eligible_for_retry: n%3 != 2 + +\echo [1/7] Inserting events... 
+ +INSERT INTO events (id, tenant_id, destination_id, time, topic, eligible_for_retry, data, metadata) +SELECT + 'evt_' || n AS id, + CASE WHEN n % 10 = 0 + THEN 'tenant_1' + ELSE 'tenant_0' + END AS tenant_id, + 'dest_' || (n % 500) AS destination_id, + '2000-01-01'::timestamptz + + (n::double precision / :'rows_count'::double precision) + * ('2000-02-01'::timestamptz - '2000-01-01'::timestamptz) + AS time, + CASE n % 3 + WHEN 0 THEN 'order.created' + WHEN 1 THEN 'order.updated' + ELSE 'payment.received' + END AS topic, + (n % 3 != 2) AS eligible_for_retry, + '{}' AS data, + '{}'::jsonb AS metadata +FROM generate_series(0, :'rows_count'::int - 1) AS n; + +-- ============================================================================ +-- 2. Bulk INSERT into attempts (chained retries) +-- ============================================================================ +-- +-- Shared columns reuse the same expressions as events. +-- Each attempt's time = event_time + (attempt_number * 1 second). +-- manual: only attempt_number >= 3 AND n%10=9 (10% of late retries). +-- Code: success→200/201, failed→500/422 (alternating on n%2). + +\echo [2/7] Inserting attempt 1 (all events)... 
+ +INSERT INTO attempts ( + id, event_id, tenant_id, destination_id, topic, status, time, + attempt_number, manual, code, response_data, + event_time, eligible_for_retry, event_data, event_metadata +) +SELECT + 'att_' || n || '_0' AS id, + 'evt_' || n AS event_id, + CASE WHEN n % 10 = 0 THEN 'tenant_1' ELSE 'tenant_0' END + AS tenant_id, + 'dest_' || (n % 500) AS destination_id, + CASE n % 3 + WHEN 0 THEN 'order.created' + WHEN 1 THEN 'order.updated' + ELSE 'payment.received' + END AS topic, + CASE WHEN n % 5 = 0 THEN 'failed' ELSE 'success' END AS status, + '2000-01-01'::timestamptz + + (n::double precision / :'rows_count'::double precision) + * ('2000-02-01'::timestamptz - '2000-01-01'::timestamptz) + + interval '1 second' + AS time, + 1 AS attempt_number, + false AS manual, + CASE + WHEN n % 5 != 0 THEN CASE WHEN n%2=0 THEN '200' ELSE '201' END + ELSE CASE WHEN n%2=0 THEN '500' ELSE '422' END + END AS code, + NULL AS response_data, + '2000-01-01'::timestamptz + + (n::double precision / :'rows_count'::double precision) + * ('2000-02-01'::timestamptz - '2000-01-01'::timestamptz) + AS event_time, + (n % 3 != 2) AS eligible_for_retry, + '{}' AS event_data, + '{}'::jsonb AS event_metadata +FROM generate_series(0, :'rows_count'::int - 1) AS n; + +\echo [3/7] Inserting attempt 2 (20% of events)... 
+ +INSERT INTO attempts ( + id, event_id, tenant_id, destination_id, topic, status, time, + attempt_number, manual, code, response_data, + event_time, eligible_for_retry, event_data, event_metadata +) +SELECT + 'att_' || n || '_1' AS id, + 'evt_' || n AS event_id, + CASE WHEN n % 10 = 0 THEN 'tenant_1' ELSE 'tenant_0' END + AS tenant_id, + 'dest_' || (n % 500) AS destination_id, + CASE n % 3 + WHEN 0 THEN 'order.created' + WHEN 1 THEN 'order.updated' + ELSE 'payment.received' + END AS topic, + CASE WHEN n % 20 = 0 THEN 'failed' ELSE 'success' END AS status, + '2000-01-01'::timestamptz + + (n::double precision / :'rows_count'::double precision) + * ('2000-02-01'::timestamptz - '2000-01-01'::timestamptz) + + interval '2 seconds' + AS time, + 2 AS attempt_number, + false AS manual, + CASE + WHEN n % 20 != 0 THEN CASE WHEN n%2=0 THEN '200' ELSE '201' END + ELSE CASE WHEN n%2=0 THEN '500' ELSE '422' END + END AS code, + NULL AS response_data, + '2000-01-01'::timestamptz + + (n::double precision / :'rows_count'::double precision) + * ('2000-02-01'::timestamptz - '2000-01-01'::timestamptz) + AS event_time, + (n % 3 != 2) AS eligible_for_retry, + '{}' AS event_data, + '{}'::jsonb AS event_metadata +FROM generate_series(0, :'rows_count'::int - 1) AS n +WHERE n % 5 = 0; + +\echo [4/7] Inserting attempt 3 (5% of events)... 
+ +INSERT INTO attempts ( + id, event_id, tenant_id, destination_id, topic, status, time, + attempt_number, manual, code, response_data, + event_time, eligible_for_retry, event_data, event_metadata +) +SELECT + 'att_' || n || '_2' AS id, + 'evt_' || n AS event_id, + CASE WHEN n % 10 = 0 THEN 'tenant_1' ELSE 'tenant_0' END + AS tenant_id, + 'dest_' || (n % 500) AS destination_id, + CASE n % 3 + WHEN 0 THEN 'order.created' + WHEN 1 THEN 'order.updated' + ELSE 'payment.received' + END AS topic, + CASE WHEN n % 100 = 0 THEN 'failed' ELSE 'success' END AS status, + '2000-01-01'::timestamptz + + (n::double precision / :'rows_count'::double precision) + * ('2000-02-01'::timestamptz - '2000-01-01'::timestamptz) + + interval '3 seconds' + AS time, + 3 AS attempt_number, + (n % 10 = 9) AS manual, + CASE + WHEN n % 100 != 0 THEN CASE WHEN n%2=0 THEN '200' ELSE '201' END + ELSE CASE WHEN n%2=0 THEN '500' ELSE '422' END + END AS code, + NULL AS response_data, + '2000-01-01'::timestamptz + + (n::double precision / :'rows_count'::double precision) + * ('2000-02-01'::timestamptz - '2000-01-01'::timestamptz) + AS event_time, + (n % 3 != 2) AS eligible_for_retry, + '{}' AS event_data, + '{}'::jsonb AS event_metadata +FROM generate_series(0, :'rows_count'::int - 1) AS n +WHERE n % 20 = 0; + +\echo [5/7] Inserting attempt 4 (1% of events)... 
+ +INSERT INTO attempts ( + id, event_id, tenant_id, destination_id, topic, status, time, + attempt_number, manual, code, response_data, + event_time, eligible_for_retry, event_data, event_metadata +) +SELECT + 'att_' || n || '_3' AS id, + 'evt_' || n AS event_id, + CASE WHEN n % 10 = 0 THEN 'tenant_1' ELSE 'tenant_0' END + AS tenant_id, + 'dest_' || (n % 500) AS destination_id, + CASE n % 3 + WHEN 0 THEN 'order.created' + WHEN 1 THEN 'order.updated' + ELSE 'payment.received' + END AS topic, + CASE WHEN n % 200 = 0 THEN 'failed' ELSE 'success' END AS status, + '2000-01-01'::timestamptz + + (n::double precision / :'rows_count'::double precision) + * ('2000-02-01'::timestamptz - '2000-01-01'::timestamptz) + + interval '4 seconds' + AS time, + 4 AS attempt_number, + (n % 10 = 9) AS manual, + CASE + WHEN n % 200 != 0 THEN CASE WHEN n%2=0 THEN '200' ELSE '201' END + ELSE CASE WHEN n%2=0 THEN '500' ELSE '422' END + END AS code, + NULL AS response_data, + '2000-01-01'::timestamptz + + (n::double precision / :'rows_count'::double precision) + * ('2000-02-01'::timestamptz - '2000-01-01'::timestamptz) + AS event_time, + (n % 3 != 2) AS eligible_for_retry, + '{}' AS event_data, + '{}'::jsonb AS event_metadata +FROM generate_series(0, :'rows_count'::int - 1) AS n +WHERE n % 100 = 0; + +-- ============================================================================ +-- 3. ANALYZE +-- ============================================================================ + +\echo [6/7] Analyzing... + +ANALYZE events; +ANALYZE attempts; + +-- ============================================================================ +-- 4. Report +-- ============================================================================ + +\echo [7/7] Done. 
Row counts: + +SELECT 'events' AS table_name, count(*) AS row_count FROM events +UNION ALL +SELECT 'attempts' AS table_name, count(*) AS row_count FROM attempts; + +SELECT attempt_number, status, count(*) AS cnt +FROM attempts GROUP BY attempt_number, status +ORDER BY attempt_number, status; diff --git a/cmd/bench/metrics/pg_test.go b/cmd/bench/metrics/pg_test.go new file mode 100644 index 000000000..deceef4c5 --- /dev/null +++ b/cmd/bench/metrics/pg_test.go @@ -0,0 +1,60 @@ +package metrics + +import ( + "context" + "os" + "testing" + + "github.com/hookdeck/outpost/internal/logstore/driver" + "github.com/hookdeck/outpost/internal/logstore/pglogstore" + "github.com/hookdeck/outpost/internal/migrator" + "github.com/jackc/pgx/v5/pgxpool" +) + +func newPGBench(tb testing.TB) driver.Metrics { + tb.Helper() + + pgURL := os.Getenv("BENCH_PG_URL") + if pgURL == "" { + tb.Skip("BENCH_PG_URL not set — skipping PG metrics benchmarks") + } + + ctx := context.Background() + + // Run migrations. + m, err := migrator.New(migrator.MigrationOpts{ + PG: migrator.MigrationOptsPG{URL: pgURL}, + }) + if err != nil { + tb.Fatalf("migrator: %v", err) + } + _, _, err = m.Up(ctx, -1) + if err != nil { + tb.Fatalf("migrate up: %v", err) + } + srcErr, dbErr := m.Close(ctx) + if srcErr != nil { + tb.Fatalf("migrator close src: %v", srcErr) + } + if dbErr != nil { + tb.Fatalf("migrator close db: %v", dbErr) + } + + db, err := pgxpool.New(ctx, pgURL) + if err != nil { + tb.Fatalf("pgxpool: %v", err) + } + tb.Cleanup(db.Close) + + return pglogstore.NewLogStore(db) +} + +func BenchmarkPGEventMetrics(b *testing.B) { + store := newPGBench(b) + benchmarkEventMetrics(b, store) +} + +func BenchmarkPGAttemptMetrics(b *testing.B) { + store := newPGBench(b) + benchmarkAttemptMetrics(b, store) +} diff --git a/docs/apis/openapi.yaml b/docs/apis/openapi.yaml index b4df10b4f..59fd21980 100644 --- a/docs/apis/openapi.yaml +++ b/docs/apis/openapi.yaml @@ -2035,6 +2035,69 @@ components: description: Regex 
pattern for validation (compatible with HTML5 pattern attribute). example: "^[a-zA-Z0-9_]+$" + MetricsDataPoint: + type: object + properties: + time_bucket: + type: + - string + - "null" + format: date-time + description: Start of the time bucket. Null when no granularity is specified. + example: "2026-03-02T14:00:00Z" + dimensions: + type: object + additionalProperties: + type: string + description: Dimension values for this data point. Empty object when no dimensions are requested. + example: + destination_id: "dest_abc" + topic: "user.created" + metrics: + type: object + additionalProperties: {} + description: Requested measure values for this data point. + example: + count: 1423 + error_rate: 0.02 + + MetricsMetadata: + type: object + properties: + granularity: + type: + - string + - "null" + description: The granularity used for time bucketing, or null if none was specified. + example: "1h" + query_time_ms: + type: integer + description: Query execution time in milliseconds. + example: 42 + row_count: + type: integer + description: Number of data points returned. + example: 2 + row_limit: + type: integer + description: Maximum number of rows the query will return. + example: 100000 + truncated: + type: boolean + description: Whether the results were truncated due to hitting the row limit. + example: false + + MetricsResponse: + type: object + properties: + data: + type: array + items: + $ref: "#/components/schemas/MetricsDataPoint" + description: Array of aggregated data points. + metadata: + $ref: "#/components/schemas/MetricsMetadata" + # Security is applied per-operation based on AuthScope tags: @@ -2095,6 +2158,9 @@ tags: - `include=response_data`: Include response body and headers from the attempt - name: Events description: Operations related to event history. + - name: Metrics + description: | + Aggregated metrics for events and delivery attempts. Supports time bucketing, dimensional grouping, and filtering. 
paths:

  /healthz:
@@ -3730,3 +3796,314 @@
         $ref: "#/components/responses/Unauthorized"
       "500":
         $ref: "#/components/responses/InternalServerError"
+
+  /metrics/events:
+    get:
+      tags: [Metrics]
+      summary: Get Event Metrics
+      description: |
+        Returns aggregated event publish metrics. Supports time bucketing via granularity,
+        dimensional grouping, and filtering.
+
+        **Measures:** `count`, `rate`
+
+        **Dimensions:** `tenant_id` (admin-only), `topic`, `destination_id`
+
+        **Filters:** `tenant_id` (admin-only), `topic`, `destination_id`
+      operationId: getEventMetrics
+      parameters:
+        - name: time[start]
+          in: query
+          required: true
+          schema:
+            type: string
+            format: date-time
+          description: Start of the time range (inclusive). ISO 8601 timestamp.
+          example: "2026-03-02T00:00:00Z"
+        - name: time[end]
+          in: query
+          required: true
+          schema:
+            type: string
+            format: date-time
+          description: End of the time range (exclusive). ISO 8601 timestamp.
+          example: "2026-03-03T00:00:00Z"
+        - name: granularity
+          in: query
+          required: false
+          schema:
+            type: string
+          description: |
+            Time bucketing granularity. Pattern: `<value><unit>`.
+            Units: `s` (1-60), `m` (1-60), `h` (1-24), `d` (1-31), `w` (1-4), `M` (1-12).
+            When omitted, returns a single aggregate row per dimension combination.
+          example: "1h"
+        - name: measures
+          in: query
+          required: true
+          schema:
+            oneOf:
+              - type: string
+                enum: [count, rate]
+              - type: array
+                items:
+                  type: string
+                  enum: [count, rate]
+          description: Measures to compute. At least one required. `rate` is events/second throughput. Use bracket notation for multiple values (e.g., `measures[0]=count`).
+          example: ["count"]
+        - name: dimensions
+          in: query
+          required: false
+          schema:
+            oneOf:
+              - type: string
+                enum: [tenant_id, topic, destination_id]
+              - type: array
+                items:
+                  type: string
+                  enum: [tenant_id, topic, destination_id]
+          description: Dimensions to group results by. 
Use bracket notation for multiple values (e.g., `dimensions[0]=topic&dimensions[1]=destination_id`). + - name: filters[topic] + in: query + required: false + schema: + oneOf: + - type: string + - type: array + items: + type: string + description: Filter by topic name(s). Use bracket notation for multiple values (e.g., `filters[topic][0]=user.created&filters[topic][1]=user.updated`). + - name: filters[destination_id] + in: query + required: false + schema: + oneOf: + - type: string + - type: array + items: + type: string + description: Filter by destination ID(s). Use bracket notation for multiple values (e.g., `filters[destination_id][0]=d1&filters[destination_id][1]=d2`). + - name: filters[tenant_id] + in: query + required: false + schema: + oneOf: + - type: string + - type: array + items: + type: string + description: Filter by tenant ID(s). Admin-only — rejected with 403 for JWT callers. Use bracket notation for multiple values (e.g., `filters[tenant_id][0]=t1&filters[tenant_id][1]=t2`). + responses: + "200": + description: Aggregated event metrics. + content: + application/json: + schema: + $ref: "#/components/schemas/MetricsResponse" + examples: + HourlyEventCount: + value: + data: + - time_bucket: "2026-03-02T14:00:00Z" + dimensions: + topic: "user.created" + metrics: + count: 1423 + - time_bucket: "2026-03-02T15:00:00Z" + dimensions: + topic: "user.created" + metrics: + count: 1891 + metadata: + granularity: "1h" + query_time_ms: 42 + row_count: 2 + row_limit: 100000 + truncated: false + "400": + $ref: "#/components/responses/BadRequest" + "401": + $ref: "#/components/responses/Unauthorized" + "403": + description: JWT caller attempted to use admin-only dimension or filter (tenant_id). 
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/APIErrorResponse"
+        "500":
+          $ref: "#/components/responses/InternalServerError"
+
+  /metrics/attempts:
+    get:
+      tags: [Metrics]
+      summary: Get Attempt Metrics
+      description: |
+        Returns aggregated delivery attempt metrics. Supports time bucketing via granularity,
+        dimensional grouping, and filtering.
+
+        **Measures:** `count`, `successful_count`, `failed_count`, `error_rate`,
+        `first_attempt_count`, `retry_count`, `manual_retry_count`, `avg_attempt_number`,
+        `rate`, `successful_rate`, `failed_rate`
+
+        **Dimensions:** `tenant_id` (admin-only), `destination_id`, `topic`, `status`, `code`, `manual`, `attempt_number`
+
+        **Filters:** `tenant_id` (admin-only), `destination_id`, `topic`, `status`, `code`, `manual`, `attempt_number`
+      operationId: getAttemptMetrics
+      parameters:
+        - name: time[start]
+          in: query
+          required: true
+          schema:
+            type: string
+            format: date-time
+          description: Start of the time range (inclusive). ISO 8601 timestamp.
+          example: "2026-03-02T00:00:00Z"
+        - name: time[end]
+          in: query
+          required: true
+          schema:
+            type: string
+            format: date-time
+          description: End of the time range (exclusive). ISO 8601 timestamp.
+          example: "2026-03-03T00:00:00Z"
+        - name: granularity
+          in: query
+          required: false
+          schema:
+            type: string
+          description: |
+            Time bucketing granularity. Pattern: `<value><unit>`.
+            Units: `s` (1-60), `m` (1-60), `h` (1-24), `d` (1-31), `w` (1-4), `M` (1-12).
+            When omitted, returns a single aggregate row per dimension combination. 
+ example: "1h" + - name: measures + in: query + required: true + schema: + oneOf: + - type: string + enum: [count, successful_count, failed_count, error_rate, first_attempt_count, retry_count, manual_retry_count, avg_attempt_number, rate, successful_rate, failed_rate] + - type: array + items: + type: string + enum: [count, successful_count, failed_count, error_rate, first_attempt_count, retry_count, manual_retry_count, avg_attempt_number, rate, successful_rate, failed_rate] + description: Measures to compute. At least one required. Rate measures (`rate`, `successful_rate`, `failed_rate`) are throughput in events/second. Use bracket notation for multiple values (e.g., `measures[0]=count&measures[1]=error_rate`). + example: ["count", "error_rate"] + - name: dimensions + in: query + required: false + schema: + oneOf: + - type: string + enum: [tenant_id, destination_id, topic, status, code, manual, attempt_number] + - type: array + items: + type: string + enum: [tenant_id, destination_id, topic, status, code, manual, attempt_number] + description: Dimensions to group results by. Use bracket notation for multiple values (e.g., `dimensions[0]=status&dimensions[1]=destination_id`). + - name: filters[destination_id] + in: query + required: false + schema: + oneOf: + - type: string + - type: array + items: + type: string + description: Filter by destination ID(s). Use bracket notation for multiple values (e.g., `filters[destination_id][0]=d1&filters[destination_id][1]=d2`). + - name: filters[topic] + in: query + required: false + schema: + oneOf: + - type: string + - type: array + items: + type: string + description: Filter by topic name(s). Use bracket notation for multiple values (e.g., `filters[topic][0]=user.created&filters[topic][1]=user.updated`). 
+ - name: filters[status] + in: query + required: false + schema: + oneOf: + - type: string + enum: [success, failed] + - type: array + items: + type: string + enum: [success, failed] + description: Filter by attempt status(es). Use bracket notation for multiple values (e.g., `filters[status][0]=success&filters[status][1]=failed`). + - name: filters[code] + in: query + required: false + schema: + oneOf: + - type: string + - type: array + items: + type: string + description: Filter by HTTP status code(s). Use bracket notation for multiple values (e.g., `filters[code][0]=200&filters[code][1]=500`). + - name: filters[manual] + in: query + required: false + schema: + type: string + enum: ["true", "false"] + description: Filter by manual retry flag. + - name: filters[attempt_number] + in: query + required: false + schema: + oneOf: + - type: string + - type: array + items: + type: string + description: Filter by attempt number(s). Use bracket notation for multiple values (e.g., `filters[attempt_number][0]=1&filters[attempt_number][1]=2`). + - name: filters[tenant_id] + in: query + required: false + schema: + oneOf: + - type: string + - type: array + items: + type: string + description: Filter by tenant ID(s). Admin-only — rejected with 403 for JWT callers. Use bracket notation for multiple values (e.g., `filters[tenant_id][0]=t1&filters[tenant_id][1]=t2`). + responses: + "200": + description: Aggregated attempt metrics. 
+ content: + application/json: + schema: + $ref: "#/components/schemas/MetricsResponse" + examples: + DailyAttemptCounts: + value: + data: + - time_bucket: "2026-03-02T00:00:00Z" + dimensions: + destination_id: "dest_abc" + metrics: + count: 1423 + successful_count: 1393 + failed_count: 30 + error_rate: 0.0211 + metadata: + granularity: "1d" + query_time_ms: 38 + row_count: 1 + row_limit: 100000 + truncated: false + "400": + $ref: "#/components/responses/BadRequest" + "401": + $ref: "#/components/responses/Unauthorized" + "403": + description: JWT caller attempted to use admin-only dimension or filter (tenant_id). + content: + application/json: + schema: + $ref: "#/components/schemas/APIErrorResponse" + "500": + $ref: "#/components/responses/InternalServerError" diff --git a/go.mod b/go.mod index 39b9104bb..d9056f764 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/hookdeck/outpost -go 1.24.0 +go 1.26.0 require ( cloud.google.com/go/pubsub v1.41.0 diff --git a/internal/apirouter/metrics_handlers.go b/internal/apirouter/metrics_handlers.go new file mode 100644 index 000000000..e14d10d44 --- /dev/null +++ b/internal/apirouter/metrics_handlers.go @@ -0,0 +1,447 @@ +package apirouter + +import ( + "context" + "errors" + "fmt" + "net/http" + "regexp" + "strconv" + "time" + + "github.com/gin-gonic/gin" + "github.com/hookdeck/outpost/internal/logging" + "github.com/hookdeck/outpost/internal/logstore" + "github.com/hookdeck/outpost/internal/logstore/driver" +) + +// logMetricsStore is the subset of logstore.LogStore needed by metrics handlers. 
+type logMetricsStore interface { + QueryEventMetrics(ctx context.Context, req logstore.MetricsRequest) (*logstore.EventMetricsResponse, error) + QueryAttemptMetrics(ctx context.Context, req logstore.MetricsRequest) (*logstore.AttemptMetricsResponse, error) +} + +type MetricsHandlers struct { + logger *logging.Logger + metricsStore logMetricsStore +} + +func NewMetricsHandlers(logger *logging.Logger, metricsStore logMetricsStore) *MetricsHandlers { + return &MetricsHandlers{ + logger: logger, + metricsStore: metricsStore, + } +} + +// --- Allowlists --- + +type stringSet map[string]struct{} + +func newStringSet(vals ...string) stringSet { + s := make(stringSet, len(vals)) + for _, v := range vals { + s[v] = struct{}{} + } + return s +} + +func (s stringSet) contains(v string) bool { + _, ok := s[v] + return ok +} + +var ( + eventMeasures = newStringSet("count", "rate") + eventDimensions = newStringSet("tenant_id", "topic", "destination_id") + eventFilters = newStringSet("tenant_id", "topic", "destination_id") + + attemptMeasures = newStringSet("count", "successful_count", "failed_count", "error_rate", "first_attempt_count", "retry_count", "manual_retry_count", "avg_attempt_number", "rate", "successful_rate", "failed_rate") + attemptDimensions = newStringSet("tenant_id", "destination_id", "topic", "status", "code", "manual", "attempt_number") + attemptFilters = newStringSet("tenant_id", "destination_id", "topic", "status", "code", "manual", "attempt_number") +) + +// --- API response types --- + +type APIMetricsDataPoint struct { + TimeBucket *time.Time `json:"time_bucket"` + Dimensions map[string]any `json:"dimensions"` + Metrics map[string]any `json:"metrics"` +} + +type APIMetricsResponse struct { + Data []APIMetricsDataPoint `json:"data"` + Metadata APIMetricsMetadata `json:"metadata"` +} + +type APIMetricsMetadata struct { + Granularity *string `json:"granularity"` + QueryTimeMs int64 `json:"query_time_ms"` + RowCount int `json:"row_count"` + RowLimit int 
`json:"row_limit"`
+	Truncated   bool    `json:"truncated"`
+}
+
+// --- Query param parsing ---
+
+var granularityRegex = regexp.MustCompile(`^(\d+)([smhdwM])$`)
+
+// granularityMaxValues defines the maximum allowed value for each granularity
+// unit, per the metrics API spec.
+var granularityMaxValues = map[string]int{
+	"s": 60,
+	"m": 60,
+	"h": 24,
+	"d": 31,
+	"w": 4,
+	"M": 12,
+}
+
+func parseGranularity(raw string) (*logstore.Granularity, error) {
+	if raw == "" {
+		return nil, nil
+	}
+	m := granularityRegex.FindStringSubmatch(raw)
+	if m == nil {
+		return nil, fmt.Errorf("invalid granularity %q: must match <value><unit> where unit is one of s,m,h,d,w,M", raw)
+	}
+	val, _ := strconv.Atoi(m[1])
+	if val <= 0 {
+		return nil, fmt.Errorf("invalid granularity %q: value must be > 0", raw)
+	}
+	unit := m[2]
+	if maxVal, ok := granularityMaxValues[unit]; ok && val > maxVal {
+		return nil, fmt.Errorf("invalid granularity %q: %s value must be between 1 and %d", raw, unit, maxVal)
+	}
+	return &logstore.Granularity{Value: val, Unit: unit}, nil
+}
+
+func parseMetricsRequest(c *gin.Context, allowedMeasures, allowedDimensions, allowedFilters stringSet) (*logstore.MetricsRequest, error) {
+	// time[start] and time[end] are required
+	startStr := c.Query("time[start]")
+	endStr := c.Query("time[end]")
+	if startStr == "" || endStr == "" {
+		return nil, fmt.Errorf("time[start] and time[end] are required")
+	}
+
+	start, err := time.Parse(time.RFC3339, startStr)
+	if err != nil {
+		return nil, fmt.Errorf("invalid time[start]: %w", err)
+	}
+	end, err := time.Parse(time.RFC3339, endStr)
+	if err != nil {
+		return nil, fmt.Errorf("invalid time[end]: %w", err)
+	}
+
+	// granularity (optional)
+	gran, err := parseGranularity(c.Query("granularity"))
+	if err != nil {
+		return nil, err
+	}
+
+	// measures[] (required)
+	measures := ParseArrayQueryParam(c, "measures")
+	if len(measures) == 0 {
+		return nil, fmt.Errorf("at least one measures[] is required")
+	}
+	for _, m := range measures {
+		
if !allowedMeasures.contains(m) { + return nil, fmt.Errorf("unknown measure %q", m) + } + } + + // dimensions[] (optional) + dimensions := ParseArrayQueryParam(c, "dimensions") + for _, d := range dimensions { + if !allowedDimensions.contains(d) { + return nil, fmt.Errorf("unknown dimension %q", d) + } + } + + // filters[key]=val + filters := make(map[string][]string) + for key := range allowedFilters { + vals := ParseArrayQueryParam(c, "filters["+key+"]") + if len(vals) > 0 { + filters[key] = vals + } + } + + return &logstore.MetricsRequest{ + TimeRange: logstore.TimeRange{ + Start: start, + End: end, + }, + Granularity: gran, + Measures: measures, + Dimensions: dimensions, + Filters: filters, + }, nil +} + +// isJWTCaller returns true when the request was authenticated via JWT (tenant role). +func isJWTCaller(c *gin.Context) bool { + return mustRoleFromContext(c) == RoleTenant +} + +// --- Handlers --- + +// MetricsEvents handles GET /metrics/events +func (h *MetricsHandlers) MetricsEvents(c *gin.Context) { + // JWT callers cannot use tenant_id as dimension + if isJWTCaller(c) { + if rejectTenantIDDimension(c) { + return + } + } + + req, err := parseMetricsRequest(c, eventMeasures, eventDimensions, eventFilters) + if err != nil { + AbortWithError(c, http.StatusBadRequest, NewErrBadRequest(err)) + return + } + + // JWT callers: validate/inject tenant_id filter + if isJWTCaller(c) { + if enforceJWTTenantFilter(c, req) { + return + } + } + + resp, err := h.metricsStore.QueryEventMetrics(c.Request.Context(), *req) + if err != nil { + abortWithMetricsError(c, err) + return + } + + apiData := make([]APIMetricsDataPoint, len(resp.Data)) + for i, dp := range resp.Data { + apiData[i] = eventDataPointToAPI(dp, req.Measures, req.Dimensions) + } + + c.JSON(http.StatusOK, buildAPIMetricsResponse(apiData, resp.Metadata, req.Granularity)) +} + +// MetricsAttempts handles GET /metrics/attempts +func (h *MetricsHandlers) MetricsAttempts(c *gin.Context) { + // JWT callers cannot 
use tenant_id as dimension + if isJWTCaller(c) { + if rejectTenantIDDimension(c) { + return + } + } + + req, err := parseMetricsRequest(c, attemptMeasures, attemptDimensions, attemptFilters) + if err != nil { + AbortWithError(c, http.StatusBadRequest, NewErrBadRequest(err)) + return + } + + // JWT callers: validate/inject tenant_id filter + if isJWTCaller(c) { + if enforceJWTTenantFilter(c, req) { + return + } + } + + resp, err := h.metricsStore.QueryAttemptMetrics(c.Request.Context(), *req) + if err != nil { + abortWithMetricsError(c, err) + return + } + + apiData := make([]APIMetricsDataPoint, len(resp.Data)) + for i, dp := range resp.Data { + apiData[i] = attemptDataPointToAPI(dp, req.Measures, req.Dimensions) + } + + c.JSON(http.StatusOK, buildAPIMetricsResponse(apiData, resp.Metadata, req.Granularity)) +} + +// rejectTenantIDDimension aborts with 403 if the request includes tenant_id as a dimension. +// Returns true if the request was aborted. +func rejectTenantIDDimension(c *gin.Context) bool { + for _, d := range ParseArrayQueryParam(c, "dimensions") { + if d == "tenant_id" { + AbortWithError(c, http.StatusForbidden, ErrorResponse{ + Code: http.StatusForbidden, + Message: "tenant_id dimension is not allowed for tenant-scoped requests", + }) + return true + } + } + return false +} + +// enforceJWTTenantFilter validates filters[tenant_id] for JWT callers. +// If absent, injects the JWT tenant. If present but mismatched, aborts 403. +// Returns true if the request was aborted. 
+func enforceJWTTenantFilter(c *gin.Context, req *logstore.MetricsRequest) bool { + jwtTenantID := tenantIDFromContext(c) + if filterTenants, ok := req.Filters["tenant_id"]; ok { + if len(filterTenants) != 1 || filterTenants[0] != jwtTenantID { + AbortWithError(c, http.StatusForbidden, ErrorResponse{ + Code: http.StatusForbidden, + Message: "filters[tenant_id] does not match authenticated tenant", + }) + return true + } + } else { + if req.Filters == nil { + req.Filters = make(map[string][]string) + } + req.Filters["tenant_id"] = []string{jwtTenantID} + } + return false +} + +// abortWithMetricsError returns 400 for resource-limit and validation errors, 500 otherwise. +func abortWithMetricsError(c *gin.Context, err error) { + if errors.Is(err, driver.ErrInvalidTimeRange) { + AbortWithError(c, http.StatusBadRequest, NewErrBadRequest(err)) + return + } + if errors.Is(err, driver.ErrResourceLimit) { + AbortWithError(c, http.StatusBadRequest, ErrorResponse{ + Code: http.StatusBadRequest, + Message: "query too broad: try fewer dimensions, more filters, or a shorter time range", + }) + return + } + AbortWithError(c, http.StatusInternalServerError, NewErrInternalServer(err)) +} + +// --- Response transformation --- + +func buildAPIMetricsResponse(data []APIMetricsDataPoint, meta logstore.MetricsMetadata, reqGranularity *logstore.Granularity) APIMetricsResponse { + var gran *string + if reqGranularity != nil { + s := fmt.Sprintf("%d%s", reqGranularity.Value, reqGranularity.Unit) + gran = &s + } + return APIMetricsResponse{ + Data: data, + Metadata: APIMetricsMetadata{ + Granularity: gran, + QueryTimeMs: meta.QueryTimeMs, + RowCount: meta.RowCount, + RowLimit: meta.RowLimit, + Truncated: meta.Truncated, + }, + } +} + +func eventDataPointToAPI(dp logstore.EventMetricsDataPoint, measures, dimensions []string) APIMetricsDataPoint { + metrics := make(map[string]any, len(measures)) + for _, m := range measures { + switch m { + case "count": + metrics["count"] = 
derefInt(dp.Count) + case "rate": + metrics["rate"] = derefFloat64(dp.Rate) + } + } + + dims := make(map[string]any, len(dimensions)) + for _, d := range dimensions { + switch d { + case "tenant_id": + dims["tenant_id"] = derefString(dp.TenantID) + case "topic": + dims["topic"] = derefString(dp.Topic) + case "destination_id": + dims["destination_id"] = derefString(dp.DestinationID) + } + } + + return APIMetricsDataPoint{ + TimeBucket: dp.TimeBucket, + Dimensions: dims, + Metrics: metrics, + } +} + +func attemptDataPointToAPI(dp logstore.AttemptMetricsDataPoint, measures, dimensions []string) APIMetricsDataPoint { + metrics := make(map[string]any, len(measures)) + for _, m := range measures { + switch m { + case "count": + metrics["count"] = derefInt(dp.Count) + case "successful_count": + metrics["successful_count"] = derefInt(dp.SuccessfulCount) + case "failed_count": + metrics["failed_count"] = derefInt(dp.FailedCount) + case "error_rate": + metrics["error_rate"] = derefFloat64(dp.ErrorRate) + case "first_attempt_count": + metrics["first_attempt_count"] = derefInt(dp.FirstAttemptCount) + case "retry_count": + metrics["retry_count"] = derefInt(dp.RetryCount) + case "manual_retry_count": + metrics["manual_retry_count"] = derefInt(dp.ManualRetryCount) + case "avg_attempt_number": + metrics["avg_attempt_number"] = derefFloat64(dp.AvgAttemptNumber) + case "rate": + metrics["rate"] = derefFloat64(dp.Rate) + case "successful_rate": + metrics["successful_rate"] = derefFloat64(dp.SuccessfulRate) + case "failed_rate": + metrics["failed_rate"] = derefFloat64(dp.FailedRate) + } + } + + dims := make(map[string]any, len(dimensions)) + for _, d := range dimensions { + switch d { + case "tenant_id": + dims["tenant_id"] = derefString(dp.TenantID) + case "destination_id": + dims["destination_id"] = derefString(dp.DestinationID) + case "topic": + dims["topic"] = derefString(dp.Topic) + case "status": + dims["status"] = derefString(dp.Status) + case "code": + dims["code"] = 
derefString(dp.Code) + case "manual": + dims["manual"] = derefBool(dp.Manual) + case "attempt_number": + dims["attempt_number"] = derefInt(dp.AttemptNumber) + } + } + + return APIMetricsDataPoint{ + TimeBucket: dp.TimeBucket, + Dimensions: dims, + Metrics: metrics, + } +} + +// --- Deref helpers --- + +func derefInt(p *int) int { + if p != nil { + return *p + } + return 0 +} + +func derefFloat64(p *float64) float64 { + if p != nil { + return *p + } + return 0 +} + +func derefString(p *string) string { + if p != nil { + return *p + } + return "" +} + +func derefBool(p *bool) bool { + if p != nil { + return *p + } + return false +} diff --git a/internal/apirouter/metrics_handlers_test.go b/internal/apirouter/metrics_handlers_test.go new file mode 100644 index 000000000..8f44f444d --- /dev/null +++ b/internal/apirouter/metrics_handlers_test.go @@ -0,0 +1,607 @@ +package apirouter_test + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/hookdeck/outpost/internal/apirouter" + "github.com/hookdeck/outpost/internal/models" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestAPI_MetricsEvents(t *testing.T) { + baseStart := time.Now().Add(-1 * time.Hour).UTC().Truncate(time.Second) + baseEnd := time.Now().UTC().Truncate(time.Second) + baseQS := "time[start]=" + baseStart.Format(time.RFC3339) + + "&time[end]=" + baseEnd.Format(time.RFC3339) + + t.Run("happy path with granularity", func(t *testing.T) { + h := newAPITest(t) + + e1 := ef.AnyPointer(ef.WithTenantID("t1"), ef.WithTopic("user.created")) + require.NoError(t, h.logStore.InsertMany(t.Context(), []*models.LogEntry{ + {Event: e1, Attempt: attemptForEvent(e1)}, + })) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+baseQS+"&measures[0]=count&granularity=1h", nil) + resp := h.do(h.withAPIKey(req)) + + require.Equal(t, http.StatusOK, resp.Code) + + var result apirouter.APIMetricsResponse + 
require.NoError(t, json.Unmarshal(resp.Body.Bytes(), &result)) + assert.NotNil(t, result.Metadata) + assert.NotNil(t, result.Metadata.Granularity) + assert.Equal(t, "1h", *result.Metadata.Granularity) + }) + + t.Run("happy path with dimensions", func(t *testing.T) { + h := newAPITest(t) + + e1 := ef.AnyPointer(ef.WithTenantID("t1"), ef.WithTopic("user.created")) + e2 := ef.AnyPointer(ef.WithTenantID("t1"), ef.WithTopic("user.updated")) + require.NoError(t, h.logStore.InsertMany(t.Context(), []*models.LogEntry{ + {Event: e1, Attempt: attemptForEvent(e1)}, + {Event: e2, Attempt: attemptForEvent(e2)}, + })) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+baseQS+"&measures[0]=count&dimensions[0]=topic", nil) + resp := h.do(h.withAPIKey(req)) + + require.Equal(t, http.StatusOK, resp.Code) + + var result apirouter.APIMetricsResponse + require.NoError(t, json.Unmarshal(resp.Body.Bytes(), &result)) + // Each data point should have a "topic" dimension and "count" metric + for _, dp := range result.Data { + assert.Contains(t, dp.Dimensions, "topic") + assert.Contains(t, dp.Metrics, "count") + } + }) + + t.Run("no granularity returns aggregate", func(t *testing.T) { + h := newAPITest(t) + + e1 := ef.AnyPointer(ef.WithTenantID("t1")) + require.NoError(t, h.logStore.InsertMany(t.Context(), []*models.LogEntry{ + {Event: e1, Attempt: attemptForEvent(e1)}, + })) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+baseQS+"&measures[0]=count", nil) + resp := h.do(h.withAPIKey(req)) + + require.Equal(t, http.StatusOK, resp.Code) + + var result apirouter.APIMetricsResponse + require.NoError(t, json.Unmarshal(resp.Body.Bytes(), &result)) + assert.Nil(t, result.Metadata.Granularity) + // Should have data (aggregate) + if len(result.Data) > 0 { + assert.Nil(t, result.Data[0].TimeBucket) + } + }) + + t.Run("tenant isolation with JWT", func(t *testing.T) { + h := newAPITest(t) + h.tenantStore.UpsertTenant(t.Context(), 
tf.Any(tf.WithID("t1"))) + + e1 := ef.AnyPointer(ef.WithTenantID("t1")) + e2 := ef.AnyPointer(ef.WithTenantID("t2")) + require.NoError(t, h.logStore.InsertMany(t.Context(), []*models.LogEntry{ + {Event: e1, Attempt: attemptForEvent(e1)}, + {Event: e2, Attempt: attemptForEvent(e2)}, + })) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+baseQS+"&measures[0]=count", nil) + resp := h.do(h.withJWT(req, "t1")) + + require.Equal(t, http.StatusOK, resp.Code) + + var result apirouter.APIMetricsResponse + require.NoError(t, json.Unmarshal(resp.Body.Bytes(), &result)) + // Should only contain t1's data + if len(result.Data) > 0 { + count, ok := result.Data[0].Metrics["count"] + assert.True(t, ok) + // count should reflect only t1's event + assert.Equal(t, float64(1), count) + } + }) + + t.Run("admin can use tenant_id dimension", func(t *testing.T) { + h := newAPITest(t) + + e1 := ef.AnyPointer(ef.WithTenantID("t1")) + e2 := ef.AnyPointer(ef.WithTenantID("t2")) + require.NoError(t, h.logStore.InsertMany(t.Context(), []*models.LogEntry{ + {Event: e1, Attempt: attemptForEvent(e1)}, + {Event: e2, Attempt: attemptForEvent(e2)}, + })) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+baseQS+"&measures[0]=count&dimensions[0]=tenant_id", nil) + resp := h.do(h.withAPIKey(req)) + + // Admin should be allowed to use tenant_id dimension (not rejected like JWT) + require.Equal(t, http.StatusOK, resp.Code) + + var result apirouter.APIMetricsResponse + require.NoError(t, json.Unmarshal(resp.Body.Bytes(), &result)) + for _, dp := range result.Data { + assert.Contains(t, dp.Dimensions, "tenant_id") + } + }) + + t.Run("JWT rejected for tenant_id dimension", func(t *testing.T) { + h := newAPITest(t) + h.tenantStore.UpsertTenant(t.Context(), tf.Any(tf.WithID("t1"))) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+baseQS+"&measures[0]=count&dimensions[0]=tenant_id", nil) + resp := h.do(h.withJWT(req, "t1")) + + 
assert.Equal(t, http.StatusForbidden, resp.Code) + }) + + t.Run("JWT allowed with matching tenant_id filter", func(t *testing.T) { + h := newAPITest(t) + h.tenantStore.UpsertTenant(t.Context(), tf.Any(tf.WithID("t1"))) + + e1 := ef.AnyPointer(ef.WithTenantID("t1")) + require.NoError(t, h.logStore.InsertMany(t.Context(), []*models.LogEntry{ + {Event: e1, Attempt: attemptForEvent(e1)}, + })) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+baseQS+"&measures[0]=count&filters[tenant_id][0]=t1", nil) + resp := h.do(h.withJWT(req, "t1")) + + assert.Equal(t, http.StatusOK, resp.Code) + }) + + t.Run("JWT rejected for mismatched tenant_id filter", func(t *testing.T) { + h := newAPITest(t) + h.tenantStore.UpsertTenant(t.Context(), tf.Any(tf.WithID("t1"))) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+baseQS+"&measures[0]=count&filters[tenant_id][0]=t2", nil) + resp := h.do(h.withJWT(req, "t1")) + + assert.Equal(t, http.StatusForbidden, resp.Code) + }) + + t.Run("API key with tenant_id filter", func(t *testing.T) { + h := newAPITest(t) + + e1 := ef.AnyPointer(ef.WithTenantID("t1")) + e2 := ef.AnyPointer(ef.WithTenantID("t2")) + require.NoError(t, h.logStore.InsertMany(t.Context(), []*models.LogEntry{ + {Event: e1, Attempt: attemptForEvent(e1)}, + {Event: e2, Attempt: attemptForEvent(e2)}, + })) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+baseQS+"&measures[0]=count&filters[tenant_id][0]=t1", nil) + resp := h.do(h.withAPIKey(req)) + + require.Equal(t, http.StatusOK, resp.Code) + + var result apirouter.APIMetricsResponse + require.NoError(t, json.Unmarshal(resp.Body.Bytes(), &result)) + if len(result.Data) > 0 { + count, ok := result.Data[0].Metrics["count"] + assert.True(t, ok) + assert.Equal(t, float64(1), count) + } + }) + + t.Run("missing time returns 400", func(t *testing.T) { + h := newAPITest(t) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?measures[0]=count", 
nil) + resp := h.do(h.withAPIKey(req)) + + assert.Equal(t, http.StatusBadRequest, resp.Code) + }) + + t.Run("invalid granularity returns 400", func(t *testing.T) { + h := newAPITest(t) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+baseQS+"&measures[0]=count&granularity=invalid", nil) + resp := h.do(h.withAPIKey(req)) + + assert.Equal(t, http.StatusBadRequest, resp.Code) + }) + + t.Run("unknown measure returns 400", func(t *testing.T) { + h := newAPITest(t) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+baseQS+"&measures[0]=nonexistent", nil) + resp := h.do(h.withAPIKey(req)) + + assert.Equal(t, http.StatusBadRequest, resp.Code) + }) + + t.Run("unknown dimension returns 400", func(t *testing.T) { + h := newAPITest(t) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+baseQS+"&measures[0]=count&dimensions[0]=nonexistent", nil) + resp := h.do(h.withAPIKey(req)) + + assert.Equal(t, http.StatusBadRequest, resp.Code) + }) + + t.Run("start equals end returns 400", func(t *testing.T) { + h := newAPITest(t) + sameTime := time.Now().UTC().Truncate(time.Second) + qs := "time[start]=" + sameTime.Format(time.RFC3339) + + "&time[end]=" + sameTime.Format(time.RFC3339) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+qs+"&measures[0]=count", nil) + resp := h.do(h.withAPIKey(req)) + + assert.Equal(t, http.StatusBadRequest, resp.Code) + }) + + t.Run("start after end returns 400", func(t *testing.T) { + h := newAPITest(t) + now := time.Now().UTC().Truncate(time.Second) + qs := "time[start]=" + now.Format(time.RFC3339) + + "&time[end]=" + now.Add(-1*time.Hour).Format(time.RFC3339) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+qs+"&measures[0]=count", nil) + resp := h.do(h.withAPIKey(req)) + + assert.Equal(t, http.StatusBadRequest, resp.Code) + }) + + t.Run("too many buckets returns 400 query too broad", func(t *testing.T) { + h := newAPITest(t) + + // 
1s granularity over 2 days = 172800 buckets > 100k limit + start := time.Date(2000, 1, 1, 0, 0, 0, 0, time.UTC) + end := time.Date(2000, 1, 3, 0, 0, 0, 0, time.UTC) + qs := "time[start]=" + start.Format(time.RFC3339) + + "&time[end]=" + end.Format(time.RFC3339) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+qs+"&measures[0]=count&granularity=1s", nil) + resp := h.do(h.withAPIKey(req)) + + assert.Equal(t, http.StatusBadRequest, resp.Code) + assert.Contains(t, resp.Body.String(), "query too broad") + }) + + t.Run("granularity value out of range returns 400", func(t *testing.T) { + h := newAPITest(t) + + cases := []string{"61s", "61m", "25h", "32d", "5w", "13M"} + for _, gran := range cases { + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+baseQS+"&measures[0]=count&granularity="+gran, nil) + resp := h.do(h.withAPIKey(req)) + assert.Equal(t, http.StatusBadRequest, resp.Code, "granularity %s should be rejected", gran) + } + }) + + t.Run("missing measures returns 400", func(t *testing.T) { + h := newAPITest(t) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+baseQS, nil) + resp := h.do(h.withAPIKey(req)) + + assert.Equal(t, http.StatusBadRequest, resp.Code) + }) + + t.Run("filter by topic", func(t *testing.T) { + h := newAPITest(t) + + e1 := ef.AnyPointer(ef.WithTenantID("t1"), ef.WithTopic("user.created")) + e2 := ef.AnyPointer(ef.WithTenantID("t1"), ef.WithTopic("user.updated")) + require.NoError(t, h.logStore.InsertMany(t.Context(), []*models.LogEntry{ + {Event: e1, Attempt: attemptForEvent(e1)}, + {Event: e2, Attempt: attemptForEvent(e2)}, + })) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+baseQS+"&measures[0]=count&filters[topic][0]=user.created", nil) + resp := h.do(h.withAPIKey(req)) + + require.Equal(t, http.StatusOK, resp.Code) + + var result apirouter.APIMetricsResponse + require.NoError(t, json.Unmarshal(resp.Body.Bytes(), &result)) + if 
len(result.Data) > 0 { + count, ok := result.Data[0].Metrics["count"] + assert.True(t, ok) + assert.Equal(t, float64(1), count) + } + }) + + t.Run("empty results", func(t *testing.T) { + h := newAPITest(t) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+baseQS+"&measures[0]=count", nil) + resp := h.do(h.withAPIKey(req)) + + require.Equal(t, http.StatusOK, resp.Code) + + var result apirouter.APIMetricsResponse + require.NoError(t, json.Unmarshal(resp.Body.Bytes(), &result)) + assert.Empty(t, result.Data) + }) + + t.Run("rate measure returns float", func(t *testing.T) { + h := newAPITest(t) + + e1 := ef.AnyPointer(ef.WithTenantID("t1")) + require.NoError(t, h.logStore.InsertMany(t.Context(), []*models.LogEntry{ + {Event: e1, Attempt: attemptForEvent(e1)}, + })) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+baseQS+"&measures[0]=rate", nil) + resp := h.do(h.withAPIKey(req)) + + require.Equal(t, http.StatusOK, resp.Code) + + var result apirouter.APIMetricsResponse + require.NoError(t, json.Unmarshal(resp.Body.Bytes(), &result)) + if len(result.Data) > 0 { + rate, ok := result.Data[0].Metrics["rate"] + assert.True(t, ok) + rateVal, _ := rate.(float64) + assert.Greater(t, rateVal, 0.0) + // count should not appear since user didn't request it + _, hasCount := result.Data[0].Metrics["count"] + assert.False(t, hasCount) + } + }) + + t.Run("rate and count together", func(t *testing.T) { + h := newAPITest(t) + + e1 := ef.AnyPointer(ef.WithTenantID("t1")) + require.NoError(t, h.logStore.InsertMany(t.Context(), []*models.LogEntry{ + {Event: e1, Attempt: attemptForEvent(e1)}, + })) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/events?"+baseQS+"&measures[0]=count&measures[1]=rate", nil) + resp := h.do(h.withAPIKey(req)) + + require.Equal(t, http.StatusOK, resp.Code) + + var result apirouter.APIMetricsResponse + require.NoError(t, json.Unmarshal(resp.Body.Bytes(), &result)) + if len(result.Data) > 0 { + 
assert.Contains(t, result.Data[0].Metrics, "count") + assert.Contains(t, result.Data[0].Metrics, "rate") + } + }) +} + +func TestAPI_MetricsAttempts(t *testing.T) { + baseStart := time.Now().Add(-1 * time.Hour).UTC().Truncate(time.Second) + baseEnd := time.Now().UTC().Truncate(time.Second) + baseQS := "time[start]=" + baseStart.Format(time.RFC3339) + + "&time[end]=" + baseEnd.Format(time.RFC3339) + + t.Run("happy path with multiple measures", func(t *testing.T) { + h := newAPITest(t) + + e1 := ef.AnyPointer(ef.WithTenantID("t1")) + a1 := attemptForEvent(e1, af.WithStatus("successful")) + require.NoError(t, h.logStore.InsertMany(t.Context(), []*models.LogEntry{ + {Event: e1, Attempt: a1}, + })) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/attempts?"+baseQS+"&measures[0]=count&measures[1]=successful_count&measures[2]=error_rate", nil) + resp := h.do(h.withAPIKey(req)) + + require.Equal(t, http.StatusOK, resp.Code) + + var result apirouter.APIMetricsResponse + require.NoError(t, json.Unmarshal(resp.Body.Bytes(), &result)) + if len(result.Data) > 0 { + assert.Contains(t, result.Data[0].Metrics, "count") + assert.Contains(t, result.Data[0].Metrics, "successful_count") + assert.Contains(t, result.Data[0].Metrics, "error_rate") + } + }) + + t.Run("with granularity and dimensions", func(t *testing.T) { + h := newAPITest(t) + + e1 := ef.AnyPointer(ef.WithTenantID("t1")) + a1 := attemptForEvent(e1, af.WithStatus("successful")) + a2 := attemptForEvent(e1, af.WithStatus("failed")) + require.NoError(t, h.logStore.InsertMany(t.Context(), []*models.LogEntry{ + {Event: e1, Attempt: a1}, + {Event: e1, Attempt: a2}, + })) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/attempts?"+baseQS+"&measures[0]=count&granularity=1h&dimensions[0]=status", nil) + resp := h.do(h.withAPIKey(req)) + + require.Equal(t, http.StatusOK, resp.Code) + + var result apirouter.APIMetricsResponse + require.NoError(t, json.Unmarshal(resp.Body.Bytes(), &result)) + for _, 
dp := range result.Data { + assert.Contains(t, dp.Dimensions, "status") + assert.Contains(t, dp.Metrics, "count") + } + }) + + t.Run("tenant isolation with JWT", func(t *testing.T) { + h := newAPITest(t) + h.tenantStore.UpsertTenant(t.Context(), tf.Any(tf.WithID("t1"))) + + e1 := ef.AnyPointer(ef.WithTenantID("t1")) + e2 := ef.AnyPointer(ef.WithTenantID("t2")) + require.NoError(t, h.logStore.InsertMany(t.Context(), []*models.LogEntry{ + {Event: e1, Attempt: attemptForEvent(e1)}, + {Event: e2, Attempt: attemptForEvent(e2)}, + })) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/attempts?"+baseQS+"&measures[0]=count", nil) + resp := h.do(h.withJWT(req, "t1")) + + require.Equal(t, http.StatusOK, resp.Code) + + var result apirouter.APIMetricsResponse + require.NoError(t, json.Unmarshal(resp.Body.Bytes(), &result)) + if len(result.Data) > 0 { + count, ok := result.Data[0].Metrics["count"] + assert.True(t, ok) + assert.Equal(t, float64(1), count) + } + }) + + t.Run("JWT rejected for tenant_id dimension", func(t *testing.T) { + h := newAPITest(t) + h.tenantStore.UpsertTenant(t.Context(), tf.Any(tf.WithID("t1"))) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/attempts?"+baseQS+"&measures[0]=count&dimensions[0]=tenant_id", nil) + resp := h.do(h.withJWT(req, "t1")) + + assert.Equal(t, http.StatusForbidden, resp.Code) + }) + + t.Run("filter by status", func(t *testing.T) { + h := newAPITest(t) + + e1 := ef.AnyPointer(ef.WithTenantID("t1")) + a1 := attemptForEvent(e1, af.WithStatus("successful")) + a2 := attemptForEvent(e1, af.WithStatus("failed")) + require.NoError(t, h.logStore.InsertMany(t.Context(), []*models.LogEntry{ + {Event: e1, Attempt: a1}, + {Event: e1, Attempt: a2}, + })) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/attempts?"+baseQS+"&measures[0]=count&filters[status][0]=successful", nil) + resp := h.do(h.withAPIKey(req)) + + require.Equal(t, http.StatusOK, resp.Code) + + var result 
apirouter.APIMetricsResponse + require.NoError(t, json.Unmarshal(resp.Body.Bytes(), &result)) + if len(result.Data) > 0 { + count, ok := result.Data[0].Metrics["count"] + assert.True(t, ok) + assert.Equal(t, float64(1), count) + } + }) + + t.Run("start equals end returns 400", func(t *testing.T) { + h := newAPITest(t) + sameTime := time.Now().UTC().Truncate(time.Second) + qs := "time[start]=" + sameTime.Format(time.RFC3339) + + "&time[end]=" + sameTime.Format(time.RFC3339) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/attempts?"+qs+"&measures[0]=count", nil) + resp := h.do(h.withAPIKey(req)) + + assert.Equal(t, http.StatusBadRequest, resp.Code) + }) + + t.Run("unknown attempt measure returns 400", func(t *testing.T) { + h := newAPITest(t) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/attempts?"+baseQS+"&measures[0]=nonexistent", nil) + resp := h.do(h.withAPIKey(req)) + + assert.Equal(t, http.StatusBadRequest, resp.Code) + }) + + t.Run("empty results", func(t *testing.T) { + h := newAPITest(t) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/attempts?"+baseQS+"&measures[0]=count", nil) + resp := h.do(h.withAPIKey(req)) + + require.Equal(t, http.StatusOK, resp.Code) + + var result apirouter.APIMetricsResponse + require.NoError(t, json.Unmarshal(resp.Body.Bytes(), &result)) + assert.Empty(t, result.Data) + }) + + t.Run("rate measures without counts", func(t *testing.T) { + h := newAPITest(t) + + e1 := ef.AnyPointer(ef.WithTenantID("t1")) + a1 := attemptForEvent(e1, af.WithStatus("successful")) + a2 := attemptForEvent(e1, af.WithStatus("failed")) + require.NoError(t, h.logStore.InsertMany(t.Context(), []*models.LogEntry{ + {Event: e1, Attempt: a1}, + {Event: e1, Attempt: a2}, + })) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/attempts?"+baseQS+"&measures[0]=rate&measures[1]=successful_rate&measures[2]=failed_rate", nil) + resp := h.do(h.withAPIKey(req)) + + require.Equal(t, http.StatusOK, 
resp.Code) + + var result apirouter.APIMetricsResponse + require.NoError(t, json.Unmarshal(resp.Body.Bytes(), &result)) + if len(result.Data) > 0 { + dp := result.Data[0] + assert.Contains(t, dp.Metrics, "rate") + assert.Contains(t, dp.Metrics, "successful_rate") + assert.Contains(t, dp.Metrics, "failed_rate") + // Injected dependencies should not appear + assert.NotContains(t, dp.Metrics, "count") + assert.NotContains(t, dp.Metrics, "successful_count") + assert.NotContains(t, dp.Metrics, "failed_count") + } + }) + + t.Run("rate with granularity", func(t *testing.T) { + h := newAPITest(t) + + e1 := ef.AnyPointer(ef.WithTenantID("t1")) + require.NoError(t, h.logStore.InsertMany(t.Context(), []*models.LogEntry{ + {Event: e1, Attempt: attemptForEvent(e1)}, + })) + + req := httptest.NewRequest(http.MethodGet, + "/api/v1/metrics/attempts?"+baseQS+"&measures[0]=count&measures[1]=rate&granularity=1h", nil) + resp := h.do(h.withAPIKey(req)) + + require.Equal(t, http.StatusOK, resp.Code) + + var result apirouter.APIMetricsResponse + require.NoError(t, json.Unmarshal(resp.Body.Bytes(), &result)) + for _, dp := range result.Data { + assert.Contains(t, dp.Metrics, "count") + assert.Contains(t, dp.Metrics, "rate") + } + }) +} diff --git a/internal/apirouter/router.go b/internal/apirouter/router.go index e2e61e1e9..0e9d3e4ea 100644 --- a/internal/apirouter/router.go +++ b/internal/apirouter/router.go @@ -142,6 +142,7 @@ func NewRouter(cfg RouterConfig, deps RouterDeps) http.Handler { logHandlers := NewLogHandlers(deps.Logger, deps.LogStore) retryHandlers := NewRetryHandlers(deps.Logger, deps.TenantStore, deps.LogStore, deps.DeliveryPublisher) topicHandlers := NewTopicHandlers(deps.Logger, cfg.Topics) + metricsHandlers := NewMetricsHandlers(deps.Logger, deps.LogStore) routes := []RouteDefinition{ // Schemas & Topics @@ -179,6 +180,10 @@ func NewRouter(cfg RouterConfig, deps RouterDeps) http.Handler { // Attempts {Method: http.MethodGet, Path: "/attempts", Handler: 
logHandlers.ListAttempts}, {Method: http.MethodGet, Path: "/attempts/:attempt_id", Handler: logHandlers.RetrieveAttempt}, + + // Metrics + {Method: http.MethodGet, Path: "/metrics/events", Handler: metricsHandlers.MetricsEvents}, + {Method: http.MethodGet, Path: "/metrics/attempts", Handler: metricsHandlers.MetricsAttempts}, } registerRoutes(apiRouter, cfg, deps.TenantStore, routes) diff --git a/internal/logstore/bucket/bucket.go b/internal/logstore/bucket/bucket.go new file mode 100644 index 000000000..4b6dd4c7a --- /dev/null +++ b/internal/logstore/bucket/bucket.go @@ -0,0 +1,110 @@ +package bucket + +import ( + "fmt" + "time" + + "github.com/hookdeck/outpost/internal/logstore/driver" +) + +const maxBuckets = 100000 + +// ErrTooManyBuckets is returned when the granularity + time range would +// produce more than maxBuckets time slots, which could cause OOM. +var ErrTooManyBuckets = fmt.Errorf("time range produces more than %d buckets", maxBuckets) + +// epochDay is the anchor for epoch-based day/week alignment (1970-01-01 UTC). +var epochDay = time.Date(1970, 1, 1, 0, 0, 0, 0, time.UTC) + +// epochSunday is the first Sunday on or after the Unix epoch (1970-01-04 UTC), +// used as the anchor for Sunday-based week alignment. +var epochSunday = time.Date(1970, 1, 4, 0, 0, 0, 0, time.UTC) + +// TruncateTime truncates t to the boundary defined by granularity g. +// This is the shared implementation used by all backends. +// +// For sub-day units (s, m, h), Value controls both step size and alignment. +// For calendar units (d, w, M), Value > 1 uses epoch-anchored alignment so +// that multi-day/week/month intervals aggregate data correctly. 
+func TruncateTime(t time.Time, g *driver.Granularity) time.Time { + t = t.UTC() + switch g.Unit { + case "s": + d := time.Duration(g.Value) * time.Second + return t.Truncate(d) + case "m": + d := time.Duration(g.Value) * time.Minute + return t.Truncate(d) + case "h": + d := time.Duration(g.Value) * time.Hour + return t.Truncate(d) + case "d": + dayStart := time.Date(t.Year(), t.Month(), t.Day(), 0, 0, 0, 0, time.UTC) + if g.Value == 1 { + return dayStart + } + days := int(dayStart.Sub(epochDay).Hours() / 24) + aligned := (days / g.Value) * g.Value + return epochDay.AddDate(0, 0, aligned) + case "w": + weekday := int(t.Weekday()) + sunday := time.Date(t.Year(), t.Month(), t.Day()-weekday, 0, 0, 0, 0, time.UTC) + if g.Value == 1 { + return sunday + } + weeks := int(sunday.Sub(epochSunday).Hours() / (7 * 24)) + aligned := (weeks / g.Value) * g.Value + return epochSunday.AddDate(0, 0, aligned*7) + case "M": + if g.Value == 1 { + return time.Date(t.Year(), t.Month(), 1, 0, 0, 0, 0, time.UTC) + } + totalMonths := (t.Year()-1970)*12 + int(t.Month()-1) + aligned := (totalMonths / g.Value) * g.Value + y := 1970 + aligned/12 + m := time.Month(aligned%12 + 1) + return time.Date(y, m, 1, 0, 0, 0, 0, time.UTC) + default: + return t + } +} + +// AdvanceTime steps forward by one granularity unit from t. +func AdvanceTime(t time.Time, g *driver.Granularity) time.Time { + switch g.Unit { + case "s": + return t.Add(time.Duration(g.Value) * time.Second) + case "m": + return t.Add(time.Duration(g.Value) * time.Minute) + case "h": + return t.Add(time.Duration(g.Value) * time.Hour) + case "d": + return t.AddDate(0, 0, g.Value) + case "w": + return t.AddDate(0, 0, 7*g.Value) + case "M": + return t.AddDate(0, g.Value, 0) + default: + return t + } +} + +// GenerateTimeBuckets returns a slice of aligned time slots from the truncated +// start up to (but not including) end, stepping by one granularity unit. +// Returns ErrTooManyBuckets if the result would exceed maxBuckets. 
+func GenerateTimeBuckets(start, end time.Time, g *driver.Granularity) ([]time.Time, error) { + cur := TruncateTime(start, g) + var buckets []time.Time + for cur.Before(end) { + if len(buckets) >= maxBuckets { + return nil, ErrTooManyBuckets + } + buckets = append(buckets, cur) + next := AdvanceTime(cur, g) + if !next.After(cur) { + break // safety: prevent infinite loop if time doesn't advance + } + cur = next + } + return buckets, nil +} diff --git a/internal/logstore/bucket/dimkey.go b/internal/logstore/bucket/dimkey.go new file mode 100644 index 000000000..23c49dcd2 --- /dev/null +++ b/internal/logstore/bucket/dimkey.go @@ -0,0 +1,72 @@ +package bucket + +import ( + "fmt" + "strings" + + "github.com/hookdeck/outpost/internal/logstore/driver" +) + +// DimKey is an opaque composite key built from dimension values. +type DimKey string + +// EventDimKey builds a composite key from the dimension fields of an event data point. +func EventDimKey(dp *driver.EventMetricsDataPoint, dims []string) DimKey { + parts := make([]string, len(dims)) + for i, dim := range dims { + switch dim { + case "tenant_id": + parts[i] = derefStr(dp.TenantID) + case "topic": + parts[i] = derefStr(dp.Topic) + case "destination_id": + parts[i] = derefStr(dp.DestinationID) + } + } + return DimKey(strings.Join(parts, "\x00")) +} + +// AttemptDimKey builds a composite key from the dimension fields of an attempt data point. 
+func AttemptDimKey(dp *driver.AttemptMetricsDataPoint, dims []string) DimKey { + parts := make([]string, len(dims)) + for i, dim := range dims { + switch dim { + case "tenant_id": + parts[i] = derefStr(dp.TenantID) + case "destination_id": + parts[i] = derefStr(dp.DestinationID) + case "topic": + parts[i] = derefStr(dp.Topic) + case "status": + parts[i] = derefStr(dp.Status) + case "code": + parts[i] = derefStr(dp.Code) + case "manual": + parts[i] = derefBool(dp.Manual) + case "attempt_number": + parts[i] = derefInt(dp.AttemptNumber) + } + } + return DimKey(strings.Join(parts, "\x00")) +} + +func derefStr(p *string) string { + if p == nil { + return "" + } + return *p +} + +func derefBool(p *bool) string { + if p == nil { + return "" + } + return fmt.Sprintf("%t", *p) +} + +func derefInt(p *int) string { + if p == nil { + return "" + } + return fmt.Sprintf("%d", *p) +} diff --git a/internal/logstore/bucket/fill.go b/internal/logstore/bucket/fill.go new file mode 100644 index 000000000..ca364b36c --- /dev/null +++ b/internal/logstore/bucket/fill.go @@ -0,0 +1,274 @@ +package bucket + +import ( + "sort" + "time" + + "github.com/hookdeck/outpost/internal/logstore/driver" +) + +// FillEventBuckets fills in missing time buckets with zero-valued data points +// so that the response contains one entry per time slot (per dimension combo). +// Returns ErrTooManyBuckets if the time range + granularity produces too many slots. 
+func FillEventBuckets(data []driver.EventMetricsDataPoint, req driver.MetricsRequest) ([]driver.EventMetricsDataPoint, error) { + if req.Granularity == nil { + return data, nil + } + + slots, err := GenerateTimeBuckets(req.TimeRange.Start, req.TimeRange.End, req.Granularity) + if err != nil { + return nil, err + } + if len(slots) == 0 { + return data, nil + } + + if len(req.Dimensions) == 0 { + return fillEventNoDims(data, slots, req), nil + } + return fillEventWithDims(data, slots, req), nil +} + +// FillAttemptBuckets fills in missing time buckets with zero-valued data points. +// Returns ErrTooManyBuckets if the time range + granularity produces too many slots. +func FillAttemptBuckets(data []driver.AttemptMetricsDataPoint, req driver.MetricsRequest) ([]driver.AttemptMetricsDataPoint, error) { + if req.Granularity == nil { + return data, nil + } + + slots, err := GenerateTimeBuckets(req.TimeRange.Start, req.TimeRange.End, req.Granularity) + if err != nil { + return nil, err + } + if len(slots) == 0 { + return data, nil + } + + if len(req.Dimensions) == 0 { + return fillAttemptNoDims(data, slots, req), nil + } + return fillAttemptWithDims(data, slots, req), nil +} + +// ── Event filling ───────────────────────────────────────────────────────── + +func fillEventNoDims(data []driver.EventMetricsDataPoint, slots []time.Time, req driver.MetricsRequest) []driver.EventMetricsDataPoint { + index := map[time.Time]driver.EventMetricsDataPoint{} + for _, dp := range data { + if dp.TimeBucket != nil { + index[*dp.TimeBucket] = dp + } + } + + result := make([]driver.EventMetricsDataPoint, 0, len(slots)) + for _, slot := range slots { + if dp, ok := index[slot]; ok { + result = append(result, dp) + } else { + result = append(result, zeroEventDP(slot, req.Measures)) + } + } + return result +} + +func fillEventWithDims(data []driver.EventMetricsDataPoint, slots []time.Time, req driver.MetricsRequest) []driver.EventMetricsDataPoint { + type key struct { + dim DimKey + slot 
time.Time + } + + // Collect unique dimension combos and a template for each. + templates := map[DimKey]driver.EventMetricsDataPoint{} + dimOrder := []DimKey{} + index := map[key]driver.EventMetricsDataPoint{} + + for _, dp := range data { + dk := EventDimKey(&dp, req.Dimensions) + if _, exists := templates[dk]; !exists { + templates[dk] = dp + dimOrder = append(dimOrder, dk) + } + if dp.TimeBucket != nil { + index[key{dk, *dp.TimeBucket}] = dp + } + } + + sort.Slice(dimOrder, func(i, j int) bool { + return string(dimOrder[i]) < string(dimOrder[j]) + }) + + result := make([]driver.EventMetricsDataPoint, 0, len(dimOrder)*len(slots)) + for _, dk := range dimOrder { + tmpl := templates[dk] + for _, slot := range slots { + if dp, ok := index[key{dk, slot}]; ok { + result = append(result, dp) + } else { + dp := zeroEventDP(slot, req.Measures) + copyEventDims(&dp, &tmpl, req.Dimensions) + result = append(result, dp) + } + } + } + return result +} + +func zeroEventDP(slot time.Time, measures []string) driver.EventMetricsDataPoint { + dp := driver.EventMetricsDataPoint{TimeBucket: new(slot)} + for _, m := range measures { + switch m { + case "count": + dp.Count = new(0) + case "rate": + dp.Rate = new(0.0) + } + } + return dp +} + +func copyEventDims(dst, src *driver.EventMetricsDataPoint, dims []string) { + for _, dim := range dims { + switch dim { + case "tenant_id": + if src.TenantID != nil { + dst.TenantID = new(*src.TenantID) + } + case "topic": + if src.Topic != nil { + dst.Topic = new(*src.Topic) + } + case "destination_id": + if src.DestinationID != nil { + dst.DestinationID = new(*src.DestinationID) + } + } + } +} + +// ── Attempt filling ─────────────────────────────────────────────────────── + +func fillAttemptNoDims(data []driver.AttemptMetricsDataPoint, slots []time.Time, req driver.MetricsRequest) []driver.AttemptMetricsDataPoint { + index := map[time.Time]driver.AttemptMetricsDataPoint{} + for _, dp := range data { + if dp.TimeBucket != nil { + 
index[*dp.TimeBucket] = dp + } + } + + result := make([]driver.AttemptMetricsDataPoint, 0, len(slots)) + for _, slot := range slots { + if dp, ok := index[slot]; ok { + result = append(result, dp) + } else { + result = append(result, zeroAttemptDP(slot, req.Measures)) + } + } + return result +} + +func fillAttemptWithDims(data []driver.AttemptMetricsDataPoint, slots []time.Time, req driver.MetricsRequest) []driver.AttemptMetricsDataPoint { + type key struct { + dim DimKey + slot time.Time + } + + templates := map[DimKey]driver.AttemptMetricsDataPoint{} + dimOrder := []DimKey{} + index := map[key]driver.AttemptMetricsDataPoint{} + + for _, dp := range data { + dk := AttemptDimKey(&dp, req.Dimensions) + if _, exists := templates[dk]; !exists { + templates[dk] = dp + dimOrder = append(dimOrder, dk) + } + if dp.TimeBucket != nil { + index[key{dk, *dp.TimeBucket}] = dp + } + } + + sort.Slice(dimOrder, func(i, j int) bool { + return string(dimOrder[i]) < string(dimOrder[j]) + }) + + result := make([]driver.AttemptMetricsDataPoint, 0, len(dimOrder)*len(slots)) + for _, dk := range dimOrder { + tmpl := templates[dk] + for _, slot := range slots { + if dp, ok := index[key{dk, slot}]; ok { + result = append(result, dp) + } else { + dp := zeroAttemptDP(slot, req.Measures) + copyAttemptDims(&dp, &tmpl, req.Dimensions) + result = append(result, dp) + } + } + } + return result +} + +func zeroAttemptDP(slot time.Time, measures []string) driver.AttemptMetricsDataPoint { + dp := driver.AttemptMetricsDataPoint{TimeBucket: new(slot)} + for _, m := range measures { + switch m { + case "count": + dp.Count = new(0) + case "successful_count": + dp.SuccessfulCount = new(0) + case "failed_count": + dp.FailedCount = new(0) + case "error_rate": + dp.ErrorRate = new(0.0) + case "first_attempt_count": + dp.FirstAttemptCount = new(0) + case "retry_count": + dp.RetryCount = new(0) + case "manual_retry_count": + dp.ManualRetryCount = new(0) + case "avg_attempt_number": + dp.AvgAttemptNumber = 
new(0.0) + case "rate": + dp.Rate = new(0.0) + case "successful_rate": + dp.SuccessfulRate = new(0.0) + case "failed_rate": + dp.FailedRate = new(0.0) + } + } + return dp +} + +func copyAttemptDims(dst, src *driver.AttemptMetricsDataPoint, dims []string) { + for _, dim := range dims { + switch dim { + case "tenant_id": + if src.TenantID != nil { + dst.TenantID = new(*src.TenantID) + } + case "destination_id": + if src.DestinationID != nil { + dst.DestinationID = new(*src.DestinationID) + } + case "topic": + if src.Topic != nil { + dst.Topic = new(*src.Topic) + } + case "status": + if src.Status != nil { + dst.Status = new(*src.Status) + } + case "code": + if src.Code != nil { + dst.Code = new(*src.Code) + } + case "manual": + if src.Manual != nil { + dst.Manual = new(*src.Manual) + } + case "attempt_number": + if src.AttemptNumber != nil { + dst.AttemptNumber = new(*src.AttemptNumber) + } + } + } +} diff --git a/internal/logstore/chlogstore/metrics.go b/internal/logstore/chlogstore/metrics.go new file mode 100644 index 000000000..57a19c1ed --- /dev/null +++ b/internal/logstore/chlogstore/metrics.go @@ -0,0 +1,571 @@ +package chlogstore + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/hookdeck/outpost/internal/logstore/bucket" + "github.com/hookdeck/outpost/internal/logstore/driver" +) + +const metricsSettings = " SETTINGS max_execution_time = 30, max_rows_to_group_by = 5000000, group_by_overflow_mode = 'throw'" + +const ( + defaultRowLimit = 100000 + metricsQueryTimeout = 30 * time.Second +) + +func metricsCtx(ctx context.Context) (context.Context, context.CancelFunc) { + if _, ok := ctx.Deadline(); ok { + return ctx, func() {} + } + return context.WithTimeout(ctx, metricsQueryTimeout) +} + +// chTimeBucketExpr returns a ClickHouse expression that truncates a DateTime64 +// column to the given granularity, matching the Go truncation semantics used +// in the in-memory driver. 
+func chTimeBucketExpr(col string, g *driver.Granularity) string { + switch g.Unit { + case "s": + return fmt.Sprintf("toStartOfInterval(%s, INTERVAL %d SECOND)", col, g.Value) + case "m": + return fmt.Sprintf("toStartOfInterval(%s, INTERVAL %d MINUTE)", col, g.Value) + case "h": + return fmt.Sprintf("toStartOfInterval(%s, INTERVAL %d HOUR)", col, g.Value) + case "d": + if g.Value == 1 { + return fmt.Sprintf("toStartOfDay(%s)", col) + } + return fmt.Sprintf("toStartOfInterval(%s, INTERVAL %d DAY)", col, g.Value) + case "w": + if g.Value == 1 { + // mode 0 = Sunday-based weeks, matching Go's time.Weekday convention. + return fmt.Sprintf("toStartOfWeek(%s, 0)", col) + } + // Multi-week: use N*7 day intervals anchored to 1970-01-04 (Sunday). + return fmt.Sprintf("toStartOfInterval(%s, INTERVAL %d DAY, toDateTime('1970-01-04'))", col, g.Value*7) + case "M": + if g.Value == 1 { + return fmt.Sprintf("toStartOfMonth(%s)", col) + } + return fmt.Sprintf("toStartOfInterval(%s, INTERVAL %d MONTH)", col, g.Value) + default: + return col + } +} + +// addInFilter appends an IN condition with individual ? placeholders. 
func addInFilter(conditions []string, args []any, col string, vals []string) ([]string, []any) {
	if len(vals) == 0 {
		return conditions, args
	}
	// One ? placeholder bound to the whole slice; the ClickHouse client
	// expands it into the IN list.
	conditions = append(conditions, col+" IN ?")
	args = append(args, vals)
	return conditions, args
}

// ── Event Metrics ───────────────────────────────────────────────────────────

// QueryEventMetrics aggregates event counts over the events table, optionally
// bucketed by time (req.Granularity) and grouped by dimensions. Missing
// buckets are zero-filled and rates computed before returning.
func (s *logStoreImpl) QueryEventMetrics(ctx context.Context, req driver.MetricsRequest) (*driver.EventMetricsResponse, error) {
	if err := driver.ValidateMetricsRequest(req); err != nil {
		return nil, err
	}
	// Ensure the counts that requested rates depend on are also selected.
	req.Measures = driver.EnrichMeasuresForRates(req.Measures)
	ctx, cancel := metricsCtx(ctx)
	defer cancel()

	start := time.Now()

	var (
		selectExprs []string
		groupExprs  []string
		conditions  []string
		args        []any
	)

	// sf enumerates the possible SELECT fields; `order` records which were
	// actually selected, in SELECT order, so row scanning can line up below.
	type sf int
	const (
		sfTimeBucket sf = iota
		sfTenantID
		sfTopic
		sfDestID
		sfCount
	)
	var order []sf

	// Time bucket
	if req.Granularity != nil {
		expr := chTimeBucketExpr("event_time", req.Granularity)
		selectExprs = append(selectExprs, expr+" AS time_bucket")
		groupExprs = append(groupExprs, expr)
		order = append(order, sfTimeBucket)
	}

	// Dimensions
	for _, dim := range req.Dimensions {
		switch dim {
		case "tenant_id":
			selectExprs = append(selectExprs, "tenant_id")
			groupExprs = append(groupExprs, "tenant_id")
			order = append(order, sfTenantID)
		case "topic":
			selectExprs = append(selectExprs, "topic")
			groupExprs = append(groupExprs, "topic")
			order = append(order, sfTopic)
		case "destination_id":
			selectExprs = append(selectExprs, "destination_id")
			groupExprs = append(groupExprs, "destination_id")
			order = append(order, sfDestID)
		}
	}

	// Measures — use uniqExact(event_id) instead of count() to handle
	// ReplacingMergeTree duplicates from unmerged parts without FINAL.
	for _, measure := range req.Measures {
		switch measure {
		case "count":
			selectExprs = append(selectExprs, "uniqExact(event_id)")
			order = append(order, sfCount)
		}
	}

	// WHERE
	if tenantIDs, ok := req.Filters["tenant_id"]; ok {
		conditions, args = addInFilter(conditions, args, "tenant_id", tenantIDs)
	}
	// Half-open time interval: [Start, End).
	conditions = append(conditions, "event_time >= ?")
	args = append(args, req.TimeRange.Start)
	conditions = append(conditions, "event_time < ?")
	args = append(args, req.TimeRange.End)

	if topics, ok := req.Filters["topic"]; ok {
		conditions, args = addInFilter(conditions, args, "topic", topics)
	}
	if dests, ok := req.Filters["destination_id"]; ok {
		conditions, args = addInFilter(conditions, args, "destination_id", dests)
	}

	// Build SQL — no FINAL needed; uniqExact(event_id) handles dedup from
	// unmerged ReplacingMergeTree parts.
	query := fmt.Sprintf("SELECT %s FROM %s WHERE %s",
		strings.Join(selectExprs, ", "),
		s.eventsTable,
		strings.Join(conditions, " AND "))
	if len(groupExprs) > 0 {
		query += " GROUP BY " + strings.Join(groupExprs, ", ")
	}
	// Drop groups with no underlying rows; NOTE(review): also applied when
	// there is no GROUP BY — presumably to suppress the single all-zero
	// aggregate row on an empty match; confirm intent.
	query += " HAVING count() > 0"
	if len(groupExprs) > 0 {
		query += " ORDER BY " + strings.Join(groupExprs, ", ")
	}
	// Fetch one row beyond the limit so truncation can be detected.
	query += fmt.Sprintf(" LIMIT %d", defaultRowLimit+1)
	query += metricsSettings

	rows, err := s.chDB.Query(ctx, query, args...)
	if err != nil {
		return nil, wrapCHMetricsError("query event metrics", err)
	}
	defer rows.Close()

	// Scan targets are reused across rows; each value is copied into the
	// data point below before the next Scan overwrites it.
	var (
		tbVal       time.Time
		tenantIDVal string
		topicVal    string
		destIDVal   string
		countVal    uint64
	)
	scanDests := make([]any, len(order))
	for i, f := range order {
		switch f {
		case sfTimeBucket:
			scanDests[i] = &tbVal
		case sfTenantID:
			scanDests[i] = &tenantIDVal
		case sfTopic:
			scanDests[i] = &topicVal
		case sfDestID:
			scanDests[i] = &destIDVal
		case sfCount:
			scanDests[i] = &countVal
		}
	}

	data := []driver.EventMetricsDataPoint{}
	for rows.Next() {
		if err := rows.Scan(scanDests...); err != nil {
			return nil, fmt.Errorf("scan event metrics: %w", err)
		}

		dp := driver.EventMetricsDataPoint{}
		for _, f := range order {
			switch f {
			case sfTimeBucket:
				t := tbVal.UTC()
				dp.TimeBucket = &t
			case sfTenantID:
				v := tenantIDVal
				dp.TenantID = &v
			case sfTopic:
				v := topicVal
				dp.Topic = &v
			case sfDestID:
				v := destIDVal
				dp.DestinationID = &v
			case sfCount:
				v := int(countVal)
				dp.Count = &v
			}
		}
		data = append(data, dp)
	}
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("rows error: %w", err)
	}

	// We fetched limit+1 rows; more than the limit means the result was cut.
	truncated := len(data) > defaultRowLimit
	if truncated {
		data = data[:defaultRowLimit]
	}

	data, err = bucket.FillEventBuckets(data, req)
	if err != nil {
		return nil, fmt.Errorf("fill event buckets: %w: %w", driver.ErrResourceLimit, err)
	}
	driver.ComputeEventRates(data, req)

	elapsed := time.Since(start)
	return &driver.EventMetricsResponse{
		Data: data,
		Metadata: driver.MetricsMetadata{
			QueryTimeMs: elapsed.Milliseconds(),
			// NOTE(review): RowCount reflects the zero-filled result, not the
			// raw row count returned by ClickHouse — confirm that is intended.
			RowCount:  len(data),
			RowLimit:  defaultRowLimit,
			Truncated: truncated,
		},
	}, nil
}

// ── Attempt Metrics ─────────────────────────────────────────────────────────

// QueryAttemptMetrics aggregates delivery-attempt measures over the attempts
// table, with the same time bucketing / dimensions / filtering contract as
// QueryEventMetrics.
func (s *logStoreImpl) QueryAttemptMetrics(ctx context.Context, req driver.MetricsRequest) (*driver.AttemptMetricsResponse, error) {
	if err := driver.ValidateMetricsRequest(req); err != nil {
		return nil, err
	}
	req.Measures = driver.EnrichMeasuresForRates(req.Measures)
	ctx, cancel := metricsCtx(ctx)
	defer cancel()

	start := time.Now()

	var (
		selectExprs []string
		groupExprs  []string
		conditions  []string
		args        []any
	)

	// Same select-field bookkeeping as the event query: `order` mirrors the
	// SELECT list so scan destinations line up positionally.
	type sf int
	const (
		sfTimeBucket sf = iota
		sfTenantID
		sfDestID
		sfTopic
		sfStatus
		sfCode
		sfManual
		sfAttemptNumber
		sfCount
		sfSuccessCount
		sfFailedCount
		sfErrorRate
		sfFirstAttempt
		sfRetryCount
		sfManualRetry
		sfAvgAttemptNum
	)
	var order []sf

	// Time bucket
	if req.Granularity != nil {
		expr := chTimeBucketExpr("attempt_time", req.Granularity)
		selectExprs = append(selectExprs, expr+" AS time_bucket")
		groupExprs = append(groupExprs, expr)
		order = append(order, sfTimeBucket)
	}

	// Dimensions
	for _, dim := range req.Dimensions {
		switch dim {
		case "tenant_id":
			selectExprs = append(selectExprs, "tenant_id")
			groupExprs = append(groupExprs, "tenant_id")
			order = append(order, sfTenantID)
		case "destination_id":
			selectExprs = append(selectExprs, "destination_id")
			groupExprs = append(groupExprs, "destination_id")
			order = append(order, sfDestID)
		case "topic":
			selectExprs = append(selectExprs, "topic")
			groupExprs = append(groupExprs, "topic")
			order = append(order, sfTopic)
		case "status":
			selectExprs = append(selectExprs, "status")
			groupExprs = append(groupExprs, "status")
			order = append(order, sfStatus)
		case "code":
			selectExprs = append(selectExprs, "code")
			groupExprs = append(groupExprs, "code")
			order = append(order, sfCode)
		case "manual":
			selectExprs = append(selectExprs, "manual")
			groupExprs = append(groupExprs, "manual")
			order = append(order, sfManual)
		case "attempt_number":
			selectExprs = append(selectExprs, "attempt_number")
			groupExprs = append(groupExprs, "attempt_number")
			order = append(order, sfAttemptNumber)
		}
	}

	// Measures — use uniqExact/uniqExactIf(attempt_id, ...) instead of
	// count/countIf to handle ReplacingMergeTree duplicates without FINAL.
	// avg(attempt_number) is kept as-is: duplicates have identical values,
	// so the average is only negligibly affected during brief merge windows.
	for _, measure := range req.Measures {
		switch measure {
		case "count":
			selectExprs = append(selectExprs, "uniqExact(attempt_id)")
			order = append(order, sfCount)
		case "successful_count":
			selectExprs = append(selectExprs, "uniqExactIf(attempt_id, status = 'success')")
			order = append(order, sfSuccessCount)
		case "failed_count":
			selectExprs = append(selectExprs, "uniqExactIf(attempt_id, status = 'failed')")
			order = append(order, sfFailedCount)
		case "error_rate":
			selectExprs = append(selectExprs, "uniqExactIf(attempt_id, status = 'failed') / uniqExact(attempt_id)")
			order = append(order, sfErrorRate)
		case "first_attempt_count":
			selectExprs = append(selectExprs, "uniqExactIf(attempt_id, attempt_number = 1 AND NOT manual)")
			order = append(order, sfFirstAttempt)
		case "retry_count":
			selectExprs = append(selectExprs, "uniqExactIf(attempt_id, attempt_number > 1)")
			order = append(order, sfRetryCount)
		case "manual_retry_count":
			selectExprs = append(selectExprs, "uniqExactIf(attempt_id, manual)")
			order = append(order, sfManualRetry)
		case "avg_attempt_number":
			selectExprs = append(selectExprs, "avg(attempt_number)")
			order = append(order, sfAvgAttemptNum)
		}
	}

	// WHERE
	if tenantIDs, ok := req.Filters["tenant_id"]; ok {
		conditions, args = addInFilter(conditions, args, "tenant_id", tenantIDs)
	}
	// Half-open time interval: [Start, End).
	conditions = append(conditions, "attempt_time >= ?")
	args = append(args, req.TimeRange.Start)
	conditions = append(conditions, "attempt_time < ?")
	args = append(args, req.TimeRange.End)

	if statuses, ok := req.Filters["status"]; ok {
		conditions, args = addInFilter(conditions, args, "status", statuses)
	}
	if dests, ok := req.Filters["destination_id"]; ok {
		conditions, args = addInFilter(conditions, args, "destination_id", dests)
	}
	if topics, ok := req.Filters["topic"]; ok {
		conditions, args = addInFilter(conditions, args, "topic", topics)
	}
	if codes, ok := req.Filters["code"]; ok {
		conditions, args = addInFilter(conditions, args, "code", codes)
	}
	if manuals, ok := req.Filters["manual"]; ok {
		conditions, args = addInFilter(conditions, args, "manual", manuals)
	}
	if attemptNums, ok := req.Filters["attempt_number"]; ok {
		conditions, args = addInFilter(conditions, args, "attempt_number", attemptNums)
	}

	// Build SQL
	query := fmt.Sprintf("SELECT %s FROM %s WHERE %s",
		strings.Join(selectExprs, ", "),
		s.attemptsTable,
		strings.Join(conditions, " AND "))
	if len(groupExprs) > 0 {
		query += " GROUP BY " + strings.Join(groupExprs, ", ")
	}
	query += " HAVING count() > 0"
	if len(groupExprs) > 0 {
		query += " ORDER BY " + strings.Join(groupExprs, ", ")
	}
	// Fetch one row beyond the limit so truncation can be detected.
	query += fmt.Sprintf(" LIMIT %d", defaultRowLimit+1)
	query += metricsSettings

	rows, err := s.chDB.Query(ctx, query, args...)
	if err != nil {
		return nil, wrapCHMetricsError("query attempt metrics", err)
	}
	defer rows.Close()

	// Scan targets are reused across rows; values are copied into the data
	// point below before the next Scan overwrites them.
	var (
		tbVal            time.Time
		tenantIDVal      string
		destIDVal        string
		topicVal         string
		statusVal        string
		codeVal          string
		manualVal        bool
		attemptNumberVal uint32
		countVal         uint64
		successCount     uint64
		failedCount      uint64
		errorRate        float64
		firstAttempt     uint64
		retryCount       uint64
		manualRetry      uint64
		avgAttemptNum    float64
	)

	scanDests := make([]any, len(order))
	for i, f := range order {
		switch f {
		case sfTimeBucket:
			scanDests[i] = &tbVal
		case sfTenantID:
			scanDests[i] = &tenantIDVal
		case sfDestID:
			scanDests[i] = &destIDVal
		case sfTopic:
			scanDests[i] = &topicVal
		case sfStatus:
			scanDests[i] = &statusVal
		case sfCode:
			scanDests[i] = &codeVal
		case sfManual:
			scanDests[i] = &manualVal
		case sfAttemptNumber:
			scanDests[i] = &attemptNumberVal
		case sfCount:
			scanDests[i] = &countVal
		case sfSuccessCount:
			scanDests[i] = &successCount
		case sfFailedCount:
			scanDests[i] = &failedCount
		case sfErrorRate:
			scanDests[i] = &errorRate
		case sfFirstAttempt:
			scanDests[i] = &firstAttempt
		case sfRetryCount:
			scanDests[i] = &retryCount
		case sfManualRetry:
			scanDests[i] = &manualRetry
		case sfAvgAttemptNum:
			scanDests[i] = &avgAttemptNum
		}
	}

	data := []driver.AttemptMetricsDataPoint{}
	for rows.Next() {
		if err := rows.Scan(scanDests...); err != nil {
			return nil, fmt.Errorf("scan attempt metrics: %w", err)
		}

		dp := driver.AttemptMetricsDataPoint{}
		for _, f := range order {
			switch f {
			case sfTimeBucket:
				t := tbVal.UTC()
				dp.TimeBucket = &t
			case sfTenantID:
				v := tenantIDVal
				dp.TenantID = &v
			case sfDestID:
				v := destIDVal
				dp.DestinationID = &v
			case sfTopic:
				v := topicVal
				dp.Topic = &v
			case sfStatus:
				v := statusVal
				dp.Status = &v
			case sfCode:
				v := codeVal
				dp.Code = &v
			case sfManual:
				v := manualVal
				dp.Manual = &v
			case sfAttemptNumber:
				v := int(attemptNumberVal)
				dp.AttemptNumber = &v
			case sfCount:
				v := int(countVal)
				dp.Count = &v
			case sfSuccessCount:
				v := int(successCount)
				dp.SuccessfulCount = &v
			case sfFailedCount:
				v := int(failedCount)
				dp.FailedCount = &v
			case sfErrorRate:
				v := errorRate
				dp.ErrorRate = &v
			case sfFirstAttempt:
				v := int(firstAttempt)
				dp.FirstAttemptCount = &v
			case sfRetryCount:
				v := int(retryCount)
				dp.RetryCount = &v
			case sfManualRetry:
				v := int(manualRetry)
				dp.ManualRetryCount = &v
			case sfAvgAttemptNum:
				v := avgAttemptNum
				dp.AvgAttemptNumber = &v
			}
		}
		data = append(data, dp)
	}
	if err := rows.Err(); err != nil {
		return nil, fmt.Errorf("rows error: %w", err)
	}

	truncated := len(data) > defaultRowLimit
	if truncated {
		data = data[:defaultRowLimit]
	}

	data, err = bucket.FillAttemptBuckets(data, req)
	if err != nil {
		return nil, fmt.Errorf("fill attempt buckets: %w: %w", driver.ErrResourceLimit, err)
	}
	driver.ComputeAttemptRates(data, req)

	elapsed := time.Since(start)
	return &driver.AttemptMetricsResponse{
		Data: data,
		Metadata: driver.MetricsMetadata{
			QueryTimeMs: elapsed.Milliseconds(),
			RowCount:    len(data),
			RowLimit:    defaultRowLimit,
			Truncated:   truncated,
		},
	}, nil
}

// wrapCHMetricsError detects ClickHouse resource-limit errors (TOO_MANY_ROWS,
// TIMEOUT_EXCEEDED) and wraps them as driver.ErrResourceLimit so the handler
// can return 400 instead of 500.
+func wrapCHMetricsError(op string, err error) error { + msg := err.Error() + if strings.Contains(msg, "TOO_MANY_ROWS") || + strings.Contains(msg, "TIMEOUT_EXCEEDED") || + strings.Contains(msg, "max_rows_to_group_by") { + return fmt.Errorf("%s: %w: %w", op, driver.ErrResourceLimit, err) + } + return fmt.Errorf("%s: %w", op, err) +} diff --git a/internal/logstore/driver/driver.go b/internal/logstore/driver/driver.go index c6ecc57d1..d1b530ab6 100644 --- a/internal/logstore/driver/driver.go +++ b/internal/logstore/driver/driver.go @@ -16,7 +16,7 @@ type TimeFilter struct { LT *time.Time // Less than (<) } -type LogStore interface { +type Records interface { ListEvent(context.Context, ListEventRequest) (ListEventResponse, error) ListAttempt(context.Context, ListAttemptRequest) (ListAttemptResponse, error) RetrieveEvent(ctx context.Context, request RetrieveEventRequest) (*models.Event, error) @@ -24,6 +24,12 @@ type LogStore interface { InsertMany(context.Context, []*models.LogEntry) error } +// LogStore is the combined interface that all driver implementations must satisfy. +type LogStore interface { + Records + Metrics +} + type ListEventRequest struct { Next string Prev string diff --git a/internal/logstore/driver/metrics.go b/internal/logstore/driver/metrics.go new file mode 100644 index 000000000..1750f611a --- /dev/null +++ b/internal/logstore/driver/metrics.go @@ -0,0 +1,117 @@ +package driver + +import ( + "context" + "errors" + "time" +) + +// ErrResourceLimit is returned when a metrics query exceeds server-side +// resource limits (e.g. too many GROUP BY rows, query timeout). Callers +// should surface this as a 400 rather than a 500. +var ErrResourceLimit = errors.New("metrics query exceeded resource limits") + +// ErrInvalidTimeRange is returned when the time range is invalid +// (e.g. start >= end). Callers should surface this as a 400. 
+var ErrInvalidTimeRange = errors.New("invalid time range: start must be before end") + +// ValidateMetricsRequest checks that the metrics request is well-formed. +func ValidateMetricsRequest(req MetricsRequest) error { + if !req.TimeRange.Start.Before(req.TimeRange.End) { + return ErrInvalidTimeRange + } + return nil +} + +type Metrics interface { + QueryEventMetrics(ctx context.Context, req MetricsRequest) (*EventMetricsResponse, error) + QueryAttemptMetrics(ctx context.Context, req MetricsRequest) (*AttemptMetricsResponse, error) +} + +type TimeRange struct { + Start time.Time + End time.Time +} + +// Granularity defines the time-bucketing interval for metrics queries. +// For sub-day units (s, m, h), Value controls both step size and alignment +// (e.g. 5m → buckets at :00, :05, :10, …). +// For calendar units with Value=1, alignment is to the natural period start +// (start of day, Sunday-based week, or first of month). +// For calendar units with Value>1, alignment uses epoch-anchored intervals +// (d/w from 1970-01-01/1970-01-04, M from Jan 1970) so that multi-day, +// multi-week, and multi-month granularities aggregate data correctly. 
type Granularity struct {
	Value int    // interval multiplier, e.g. 5 with Unit "m" = 5-minute buckets
	Unit  string // s, m, h, d, w, M
}

// MetricsRequest describes one aggregate metrics query.
type MetricsRequest struct {
	TimeRange   TimeRange           // half-open interval [Start, End)
	Granularity *Granularity        // nil = single aggregate over the whole range
	Measures    []string            // e.g. "count", "rate", "error_rate", …
	Dimensions  []string            // GROUP BY columns, e.g. "tenant_id"
	Filters     map[string][]string // column → allowed values (IN filter)
}

// MetricsMetadata reports query diagnostics alongside the data.
type MetricsMetadata struct {
	QueryTimeMs int64 // wall-clock duration of the query + post-processing
	RowCount    int   // number of data points returned
	RowLimit    int   // server-side row cap applied to the raw result
	Truncated   bool  // true if the raw result hit RowLimit and was cut
}

// Event metrics

// EventMetricsDataPoint is one row of an event metrics result. All fields are
// pointers: nil means the field was not requested/selected.
type EventMetricsDataPoint struct {
	TimeBucket *time.Time
	// Measures
	Count *int
	Rate  *float64
	// Dimensions
	TenantID      *string
	Topic         *string
	DestinationID *string
}

type EventMetricsResponse struct {
	Data     []EventMetricsDataPoint
	Metadata MetricsMetadata
}

// Attempt metrics

// AttemptMetricsDataPoint is one row of an attempt metrics result. All fields
// are pointers: nil means the field was not requested/selected.
type AttemptMetricsDataPoint struct {
	TimeBucket *time.Time
	// Measures
	Count           *int
	SuccessfulCount *int
	FailedCount     *int
	ErrorRate       *float64
	// NOTE: The following three measures are equivalent to using "count" with
	// the corresponding filters (attempt_number=1 AND manual=false, attempt_number>1,
	// manual=true). They exist for composability — callers can request multiple
	// measures in a single query instead of issuing separate filtered queries.
	// Consider whether they're worth the added surface area or if callers should
	// use count+filters.
	FirstAttemptCount *int
	RetryCount        *int
	ManualRetryCount  *int
	AvgAttemptNumber  *float64
	Rate              *float64
	SuccessfulRate    *float64
	FailedRate        *float64
	// Dimensions
	TenantID      *string
	DestinationID *string
	Topic         *string
	Status        *string
	Code          *string
	Manual        *bool
	AttemptNumber *int
}

type AttemptMetricsResponse struct {
	Data     []AttemptMetricsDataPoint
	Metadata MetricsMetadata
}
diff --git a/internal/logstore/driver/rate.go b/internal/logstore/driver/rate.go
new file mode 100644
index 000000000..25ccbb96a
--- /dev/null
+++ b/internal/logstore/driver/rate.go
package driver

import "time"

// rateDependencies maps each derived rate measure to the count measure it requires.
+var rateDependencies = map[string]string{ + "rate": "count", + "successful_rate": "successful_count", + "failed_rate": "failed_count", +} + +// EnrichMeasuresForRates returns a new measures slice with any missing rate +// dependencies appended. For example, if "rate" is requested but "count" is not, +// "count" is added so the SQL query computes it. +func EnrichMeasuresForRates(measures []string) []string { + seen := make(map[string]struct{}, len(measures)) + for _, m := range measures { + seen[m] = struct{}{} + } + + enriched := make([]string, len(measures)) + copy(enriched, measures) + + for _, m := range measures { + if dep, ok := rateDependencies[m]; ok { + if _, exists := seen[dep]; !exists { + enriched = append(enriched, dep) + seen[dep] = struct{}{} + } + } + } + return enriched +} + +// ComputeEventRates populates Rate fields on event data points from their +// corresponding count fields and the bucket duration. +func ComputeEventRates(data []EventMetricsDataPoint, req MetricsRequest) { + if !hasMeasure(req.Measures, "rate") { + return + } + for i := range data { + dp := &data[i] + dur := bucketDurationSeconds(dp.TimeBucket, req.Granularity, req.TimeRange) + v := float64(derefIntPtr(dp.Count)) / dur + dp.Rate = &v + } +} + +// ComputeAttemptRates populates Rate, SuccessfulRate, and FailedRate fields on +// attempt data points from their corresponding count fields and the bucket duration. 
+func ComputeAttemptRates(data []AttemptMetricsDataPoint, req MetricsRequest) { + wantRate := hasMeasure(req.Measures, "rate") + wantSuccessful := hasMeasure(req.Measures, "successful_rate") + wantFailed := hasMeasure(req.Measures, "failed_rate") + if !wantRate && !wantSuccessful && !wantFailed { + return + } + + for i := range data { + dp := &data[i] + dur := bucketDurationSeconds(dp.TimeBucket, req.Granularity, req.TimeRange) + if wantRate { + v := float64(derefIntPtr(dp.Count)) / dur + dp.Rate = &v + } + if wantSuccessful { + v := float64(derefIntPtr(dp.SuccessfulCount)) / dur + dp.SuccessfulRate = &v + } + if wantFailed { + v := float64(derefIntPtr(dp.FailedCount)) / dur + dp.FailedRate = &v + } + } +} + +// bucketDurationSeconds returns the duration of one time bucket in seconds. +func bucketDurationSeconds(timeBucket *time.Time, gran *Granularity, tr TimeRange) float64 { + if gran == nil { + return tr.End.Sub(tr.Start).Seconds() + } + switch gran.Unit { + case "s": + return float64(gran.Value) + case "m": + return float64(gran.Value) * 60 + case "h": + return float64(gran.Value) * 3600 + case "d": + return float64(gran.Value) * 86400 + case "w": + return float64(gran.Value) * 7 * 86400 + case "M": + if timeBucket != nil { + t := timeBucket.UTC() + start := time.Date(t.Year(), t.Month(), 1, 0, 0, 0, 0, time.UTC) + end := start.AddDate(0, gran.Value, 0) + return end.Sub(start).Seconds() + } + return float64(gran.Value) * 30 * 86400 + default: + return tr.End.Sub(tr.Start).Seconds() + } +} + +func hasMeasure(measures []string, m string) bool { + for _, v := range measures { + if v == m { + return true + } + } + return false +} + +func derefIntPtr(p *int) int { + if p != nil { + return *p + } + return 0 +} diff --git a/internal/logstore/drivertest/drivertest.go b/internal/logstore/drivertest/drivertest.go index 2c7d27727..5f71a5ff2 100644 --- a/internal/logstore/drivertest/drivertest.go +++ b/internal/logstore/drivertest/drivertest.go @@ -39,4 +39,7 @@ func 
RunConformanceTests(t *testing.T) {
 	t.Run("Misc", func(t *testing.T) {
 		testMisc(t, newHarness)
 	})
+	t.Run("Metrics", func(t *testing.T) {
+		testMetrics(t, newHarness)
+	})
 }
diff --git a/internal/logstore/drivertest/metrics.go b/internal/logstore/drivertest/metrics.go
new file mode 100644
index 000000000..6dd99469b
--- /dev/null
+++ b/internal/logstore/drivertest/metrics.go
@@ -0,0 +1,33 @@
+package drivertest
+
+import (
+	"context"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+// testMetrics is the conformance entry point for the metrics suite: it builds
+// and seeds one shared dataset, then runs the data-correctness and
+// characteristics sub-suites against the same seeded store.
+func testMetrics(t *testing.T, newHarness HarnessMaker) {
+	t.Helper()
+
+	ctx := context.Background()
+	h, err := newHarness(ctx, t)
+	require.NoError(t, err)
+	t.Cleanup(h.Close)
+
+	logStore, err := h.MakeDriver(ctx)
+	require.NoError(t, err)
+
+	// Build and seed the dataset (shared across both sub-suites).
+	ds := buildMetricsDataset()
+	err = logStore.InsertMany(ctx, ds.entries)
+	require.NoError(t, err)
+	// NOTE(review): presumably FlushWrites forces buffered/async writes to be
+	// queryable before the sub-suites run — confirm against the Harness contract.
+	require.NoError(t, h.FlushWrites(ctx))
+
+	t.Run("DataCorrectness", func(t *testing.T) {
+		testMetricsDataCorrectness(t, ctx, logStore, ds)
+	})
+	t.Run("Characteristics", func(t *testing.T) {
+		testMetricsCharacteristics(t, ctx, logStore, ds)
+	})
+}
diff --git a/internal/logstore/drivertest/metrics_characteristics.go b/internal/logstore/drivertest/metrics_characteristics.go
new file mode 100644
index 000000000..095f7ac7e
--- /dev/null
+++ b/internal/logstore/drivertest/metrics_characteristics.go
@@ -0,0 +1,454 @@
+package drivertest
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/hookdeck/outpost/internal/logstore/driver"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// testMetricsCharacteristics asserts structural properties of the time-series
+// response contract (dense bucket filling, ordering, alignment, etc.).
+// These tests are independent of specific metric values — they validate the
+// shape of the response that dashboard consumers depend on.
+//
+// Uses the shared dataset from metrics_dataset.go. Key assumptions:
+//   - Dense day (Jan 15): data in hours 10-14 only, 250 events
+//   - Full range: Jan 2000 (sparse days 3,7,11,22,28 + dense day 15)
+//   - 3 topics cycling across all events
+func testMetricsCharacteristics(t *testing.T, ctx context.Context, logStore driver.LogStore, ds *metricsDataset) {
+	// ── 1. Empty bucket filling ──────────────────────────────────────────
+	// Dense day (Jan 15) has data in hours 10-14 only. With 1h granularity
+	// over the full day, all 24 hours must be present.
+
+	t.Run("empty bucket filling (events)", func(t *testing.T) {
+		resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{
+			Filters:     map[string][]string{"tenant_id": {ds.tenant1}},
+			TimeRange:   ds.denseDayRange.toDriver(),
+			Granularity: &driver.Granularity{Value: 1, Unit: "h"},
+			Measures:    []string{"count", "rate"},
+		})
+		require.NoError(t, err)
+		require.Len(t, resp.Data, 24, "24h range with 1h granularity must produce 24 buckets")
+
+		for _, dp := range resp.Data {
+			require.NotNil(t, dp.TimeBucket, "every bucket must have a time_bucket")
+			require.NotNil(t, dp.Count, "every bucket must have a count")
+			require.NotNil(t, dp.Rate, "every bucket must have a rate")
+			h := dp.TimeBucket.Hour()
+			if h < 10 || h > 14 {
+				assert.Equal(t, 0, *dp.Count, "hour %d should have count=0", h)
+				assert.Equal(t, 0.0, *dp.Rate, "hour %d should have rate=0.0", h)
+			} else {
+				assert.Greater(t, *dp.Count, 0, "hour %d should have count>0", h)
+				assert.Greater(t, *dp.Rate, 0.0, "hour %d should have rate>0", h)
+			}
+		}
+	})
+
+	t.Run("empty bucket filling (attempts)", func(t *testing.T) {
+		resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{
+			Filters:     map[string][]string{"tenant_id": {ds.tenant1}},
+			TimeRange:   ds.denseDayRange.toDriver(),
+			Granularity: &driver.Granularity{Value: 1, Unit: "h"},
+			Measures:    []string{"count", "error_rate", "rate", "successful_rate", "failed_rate"},
+		})
+		require.NoError(t, err)
+		require.Len(t, resp.Data, 24, "24h range with 1h granularity must produce 24 buckets")
+
+		for _, dp := range resp.Data {
+			require.NotNil(t, dp.TimeBucket)
+			require.NotNil(t, dp.Count)
+			require.NotNil(t, dp.ErrorRate, "error_rate must be present in every bucket")
+			require.NotNil(t, dp.Rate, "rate must be present in every bucket")
+			require.NotNil(t, dp.SuccessfulRate, "successful_rate must be present in every bucket")
+			require.NotNil(t, dp.FailedRate, "failed_rate must be present in every bucket")
+			h := dp.TimeBucket.Hour()
+			if h < 10 || h > 14 {
+				assert.Equal(t, 0, *dp.Count, "hour %d should have count=0", h)
+				assert.Equal(t, 0.0, *dp.ErrorRate, "hour %d should have error_rate=0.0", h)
+				assert.Equal(t, 0.0, *dp.Rate, "hour %d should have rate=0.0", h)
+				assert.Equal(t, 0.0, *dp.SuccessfulRate, "hour %d should have successful_rate=0.0", h)
+				assert.Equal(t, 0.0, *dp.FailedRate, "hour %d should have failed_rate=0.0", h)
+			}
+		}
+	})
+
+	// ── 2. Chronological ordering ──────────────────────────────────────
+	// Buckets must be sorted by time_bucket ASC.
+
+	t.Run("chronological ordering (events)", func(t *testing.T) {
+		resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{
+			Filters:     map[string][]string{"tenant_id": {ds.tenant1}},
+			TimeRange:   ds.timeRange.toDriver(),
+			Granularity: &driver.Granularity{Value: 1, Unit: "d"},
+			Measures:    []string{"count"},
+		})
+		require.NoError(t, err)
+		require.True(t, len(resp.Data) > 1, "need multiple buckets to test ordering")
+
+		for i := 1; i < len(resp.Data); i++ {
+			require.NotNil(t, resp.Data[i-1].TimeBucket)
+			require.NotNil(t, resp.Data[i].TimeBucket)
+			assert.True(t,
+				resp.Data[i-1].TimeBucket.Before(*resp.Data[i].TimeBucket),
+				"bucket %d (%s) must be before bucket %d (%s)",
+				i-1, resp.Data[i-1].TimeBucket, i, resp.Data[i].TimeBucket,
+			)
+		}
+	})
+
+	t.Run("chronological ordering (attempts)", func(t *testing.T) {
+		resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{
+			Filters:     map[string][]string{"tenant_id": {ds.tenant1}},
+			TimeRange:   ds.timeRange.toDriver(),
+			Granularity: &driver.Granularity{Value: 1, Unit: "d"},
+			Measures:    []string{"count"},
+		})
+		require.NoError(t, err)
+		require.True(t, len(resp.Data) > 1)
+
+		for i := 1; i < len(resp.Data); i++ {
+			require.NotNil(t, resp.Data[i-1].TimeBucket)
+			require.NotNil(t, resp.Data[i].TimeBucket)
+			assert.True(t,
+				resp.Data[i-1].TimeBucket.Before(*resp.Data[i].TimeBucket),
+				"bucket %d (%s) must be before bucket %d (%s)",
+				i-1, resp.Data[i-1].TimeBucket, i, resp.Data[i].TimeBucket,
+			)
+		}
+	})
+
+	// ── 3. Deterministic bucket count ──────────────────────────────────
+	// The number of buckets depends only on the time range and granularity,
+	// never on the density of data.
+
+	t.Run("deterministic bucket count", func(t *testing.T) {
+		cases := []struct {
+			name     string
+			start    time.Time
+			end      time.Time
+			gran     driver.Granularity
+			expected int
+		}{
+			{
+				name:     "24h at 1h",
+				start:    time.Date(2000, 1, 15, 0, 0, 0, 0, time.UTC),
+				end:      time.Date(2000, 1, 16, 0, 0, 0, 0, time.UTC),
+				gran:     driver.Granularity{Value: 1, Unit: "h"},
+				expected: 24,
+			},
+			{
+				name:     "7d at 1d",
+				start:    time.Date(2000, 1, 1, 0, 0, 0, 0, time.UTC),
+				end:      time.Date(2000, 1, 8, 0, 0, 0, 0, time.UTC),
+				gran:     driver.Granularity{Value: 1, Unit: "d"},
+				expected: 7,
+			},
+			{
+				name:     "1h at 1m",
+				start:    time.Date(2000, 1, 15, 10, 0, 0, 0, time.UTC),
+				end:      time.Date(2000, 1, 15, 11, 0, 0, 0, time.UTC),
+				gran:     driver.Granularity{Value: 1, Unit: "m"},
+				expected: 60,
+			},
+			{
+				name:     "1h at 5m",
+				start:    time.Date(2000, 1, 15, 10, 0, 0, 0, time.UTC),
+				end:      time.Date(2000, 1, 15, 11, 0, 0, 0, time.UTC),
+				gran:     driver.Granularity{Value: 5, Unit: "m"},
+				expected: 12,
+			},
+			{
+				name:     "granularity larger than range",
+				start:    time.Date(2000, 1, 15, 10, 0, 0, 0, time.UTC),
+				end:      time.Date(2000, 1, 15, 16, 0, 0, 0, time.UTC),
+				gran:     driver.Granularity{Value: 1, Unit: "d"},
+				expected: 1,
+			},
+		}
+
+		for _, tc := range cases {
+			t.Run(tc.name, func(t *testing.T) {
+				resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{
+					Filters:     map[string][]string{"tenant_id": {ds.tenant1}},
+					TimeRange:   driver.TimeRange{Start: tc.start, End: tc.end},
+					Granularity: &tc.gran,
+					Measures:    []string{"count"},
+				})
+				require.NoError(t, err)
+				assert.Len(t, resp.Data, tc.expected, "expected %d buckets for %s", tc.expected, tc.name)
+			})
+		}
+	})
+
+	// ── 4. Explicit zero measures ──────────────────────────────────────
+	// Empty buckets must have concrete zero values, never nil.
+
+	t.Run("explicit zero measures (events)", func(t *testing.T) {
+		resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{
+			Filters:     map[string][]string{"tenant_id": {ds.tenant1}},
+			TimeRange:   ds.denseDayRange.toDriver(),
+			Granularity: &driver.Granularity{Value: 1, Unit: "h"},
+			Measures:    []string{"count", "rate"},
+		})
+		require.NoError(t, err)
+		// Guard: need 24 buckets for this test to be meaningful (not vacuously pass).
+		require.Len(t, resp.Data, 24, "prerequisite: bucket filling must produce 24 buckets")
+
+		for _, dp := range resp.Data {
+			if dp.TimeBucket != nil && (dp.TimeBucket.Hour() < 10 || dp.TimeBucket.Hour() > 14) {
+				require.NotNil(t, dp.Count, "count must not be nil in empty bucket at %s", dp.TimeBucket)
+				require.NotNil(t, dp.Rate, "rate must not be nil in empty bucket at %s", dp.TimeBucket)
+				assert.Equal(t, 0, *dp.Count)
+				assert.Equal(t, 0.0, *dp.Rate)
+			}
+		}
+	})
+
+	t.Run("explicit zero measures (attempts)", func(t *testing.T) {
+		resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{
+			Filters:     map[string][]string{"tenant_id": {ds.tenant1}},
+			TimeRange:   ds.denseDayRange.toDriver(),
+			Granularity: &driver.Granularity{Value: 1, Unit: "h"},
+			Measures:    []string{"count", "successful_count", "failed_count", "error_rate", "first_attempt_count", "retry_count", "manual_retry_count", "avg_attempt_number", "rate", "successful_rate", "failed_rate"},
+		})
+		require.NoError(t, err)
+		// Guard: need 24 buckets for this test to be meaningful (not vacuously pass).
+		require.Len(t, resp.Data, 24, "prerequisite: bucket filling must produce 24 buckets")
+
+		for _, dp := range resp.Data {
+			if dp.TimeBucket != nil && (dp.TimeBucket.Hour() < 10 || dp.TimeBucket.Hour() > 14) {
+				require.NotNil(t, dp.Count, "count must not be nil at %s", dp.TimeBucket)
+				require.NotNil(t, dp.SuccessfulCount, "successful_count must not be nil at %s", dp.TimeBucket)
+				require.NotNil(t, dp.FailedCount, "failed_count must not be nil at %s", dp.TimeBucket)
+				require.NotNil(t, dp.ErrorRate, "error_rate must not be nil at %s", dp.TimeBucket)
+				require.NotNil(t, dp.FirstAttemptCount, "first_attempt_count must not be nil at %s", dp.TimeBucket)
+				require.NotNil(t, dp.RetryCount, "retry_count must not be nil at %s", dp.TimeBucket)
+				require.NotNil(t, dp.ManualRetryCount, "manual_retry_count must not be nil at %s", dp.TimeBucket)
+				require.NotNil(t, dp.AvgAttemptNumber, "avg_attempt_number must not be nil at %s", dp.TimeBucket)
+				require.NotNil(t, dp.Rate, "rate must not be nil at %s", dp.TimeBucket)
+				require.NotNil(t, dp.SuccessfulRate, "successful_rate must not be nil at %s", dp.TimeBucket)
+				require.NotNil(t, dp.FailedRate, "failed_rate must not be nil at %s", dp.TimeBucket)
+				assert.Equal(t, 0, *dp.Count)
+				assert.Equal(t, 0, *dp.SuccessfulCount)
+				assert.Equal(t, 0, *dp.FailedCount)
+				assert.Equal(t, 0.0, *dp.ErrorRate, "error_rate must be 0.0, not NaN")
+				assert.Equal(t, 0, *dp.FirstAttemptCount)
+				assert.Equal(t, 0, *dp.RetryCount)
+				assert.Equal(t, 0, *dp.ManualRetryCount)
+				assert.Equal(t, 0.0, *dp.AvgAttemptNumber, "avg_attempt_number must be 0.0, not NaN")
+				assert.Equal(t, 0.0, *dp.Rate, "rate must be 0.0")
+				assert.Equal(t, 0.0, *dp.SuccessfulRate, "successful_rate must be 0.0")
+				assert.Equal(t, 0.0, *dp.FailedRate, "failed_rate must be 0.0")
+			}
+		}
+	})
+
+	// ── 5. No-data range returns full bucket series ────────────────────
+	// Querying a range with zero matching events still produces the full
+	// bucket series, all with zero values.
+
+	t.Run("no-data range (events)", func(t *testing.T) {
+		// Feb 2000 is 29 days (leap year).
+		resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{
+			Filters: map[string][]string{"tenant_id": {ds.tenant1}},
+			TimeRange: driver.TimeRange{
+				Start: time.Date(2000, 2, 1, 0, 0, 0, 0, time.UTC),
+				End:   time.Date(2000, 3, 1, 0, 0, 0, 0, time.UTC),
+			},
+			Granularity: &driver.Granularity{Value: 1, Unit: "d"},
+			Measures:    []string{"count"},
+		})
+		require.NoError(t, err)
+		assert.Len(t, resp.Data, 29, "Feb 2000 (leap year) with 1d granularity must produce 29 buckets")
+
+		for _, dp := range resp.Data {
+			require.NotNil(t, dp.Count)
+			assert.Equal(t, 0, *dp.Count, "all buckets in no-data range must be zero")
+		}
+	})
+
+	t.Run("no-data range (attempts)", func(t *testing.T) {
+		resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{
+			Filters: map[string][]string{"tenant_id": {ds.tenant1}},
+			TimeRange: driver.TimeRange{
+				Start: time.Date(2000, 2, 1, 0, 0, 0, 0, time.UTC),
+				End:   time.Date(2000, 3, 1, 0, 0, 0, 0, time.UTC),
+			},
+			Granularity: &driver.Granularity{Value: 1, Unit: "d"},
+			Measures:    []string{"count"},
+		})
+		require.NoError(t, err)
+		assert.Len(t, resp.Data, 29)
+
+		for _, dp := range resp.Data {
+			require.NotNil(t, dp.Count)
+			assert.Equal(t, 0, *dp.Count)
+		}
+	})
+
+	// ── 6. Bucket alignment ─────────────────────────────────────────────
+	// When start doesn't fall on a granularity boundary, buckets still
+	// snap to the boundary (e.g., 1h → :00:00).
+
+	t.Run("bucket alignment (1h)", func(t *testing.T) {
+		resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{
+			Filters: map[string][]string{"tenant_id": {ds.tenant1}},
+			TimeRange: driver.TimeRange{
+				Start: time.Date(2000, 1, 15, 3, 17, 42, 0, time.UTC),
+				End:   time.Date(2000, 1, 15, 8, 0, 0, 0, time.UTC),
+			},
+			Granularity: &driver.Granularity{Value: 1, Unit: "h"},
+			Measures:    []string{"count"},
+		})
+		require.NoError(t, err)
+		require.NotEmpty(t, resp.Data)
+
+		// First bucket should be 03:00:00, not 03:17:42.
+		require.NotNil(t, resp.Data[0].TimeBucket)
+		assert.Equal(t, 3, resp.Data[0].TimeBucket.Hour())
+		assert.Equal(t, 0, resp.Data[0].TimeBucket.Minute())
+		assert.Equal(t, 0, resp.Data[0].TimeBucket.Second())
+
+		// All buckets must be on :00:00 boundaries.
+		for i, dp := range resp.Data {
+			require.NotNil(t, dp.TimeBucket)
+			assert.Equal(t, 0, dp.TimeBucket.Minute(), "bucket %d minute must be 0", i)
+			assert.Equal(t, 0, dp.TimeBucket.Second(), "bucket %d second must be 0", i)
+		}
+	})
+
+	t.Run("bucket alignment (1d)", func(t *testing.T) {
+		resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{
+			Filters: map[string][]string{"tenant_id": {ds.tenant1}},
+			TimeRange: driver.TimeRange{
+				Start: time.Date(2000, 1, 3, 14, 30, 0, 0, time.UTC),
+				End:   time.Date(2000, 1, 6, 0, 0, 0, 0, time.UTC),
+			},
+			Granularity: &driver.Granularity{Value: 1, Unit: "d"},
+			Measures:    []string{"count"},
+		})
+		require.NoError(t, err)
+		require.NotEmpty(t, resp.Data)
+
+		// First bucket should be Jan 3 00:00:00, not 14:30:00.
+		require.NotNil(t, resp.Data[0].TimeBucket)
+		assert.Equal(t, 3, resp.Data[0].TimeBucket.Day())
+		assert.Equal(t, 0, resp.Data[0].TimeBucket.Hour())
+
+		// All buckets at midnight.
+		for i, dp := range resp.Data {
+			require.NotNil(t, dp.TimeBucket)
+			assert.Equal(t, 0, dp.TimeBucket.Hour(), "bucket %d must be at midnight", i)
+			assert.Equal(t, 0, dp.TimeBucket.Minute(), "bucket %d must be at midnight", i)
+		}
+	})
+
+	// ── 7. Dimensions don't multiply empty buckets ─────────────────────
+	// With dimensions, empty time slots are only filled for dimension
+	// combinations that actually appear in the data (within the full date
+	// range). We must not get a cartesian product of all dimensions × all
+	// time slots.
+
+	t.Run("dimensions don't cartesian-explode empty buckets", func(t *testing.T) {
+		// Query dense day with 1h granularity and dimension=topic.
+		// 3 topics exist in the data, data spans hours 10-14.
+		// Expected: 5 hours with data × 3 topics + 19 empty hours × (only topics
+		// that appear in data for the queried range, filled per-combo along time axis).
+		//
+		// The key invariant: we must NOT get rows for topic+hour combos where
+		// that specific topic never appears anywhere in the query range.
+		resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{
+			Filters:     map[string][]string{"tenant_id": {ds.tenant1}},
+			TimeRange:   ds.denseDayRange.toDriver(),
+			Granularity: &driver.Granularity{Value: 1, Unit: "h"},
+			Measures:    []string{"count"},
+			Dimensions:  []string{"topic"},
+		})
+		require.NoError(t, err)
+
+		// Count unique topics in the response.
+		topics := map[string]bool{}
+		for _, dp := range resp.Data {
+			if dp.Topic != nil {
+				topics[*dp.Topic] = true
+			}
+		}
+		numTopics := len(topics)
+
+		// Each topic that appears should have exactly 24 buckets (one per hour).
+		assert.Len(t, resp.Data, numTopics*24,
+			"each topic must have 24 hourly buckets (dense filling per dimension combo)")
+
+		// Verify zero-filled buckets exist for each topic in empty hours.
+		type topicHour struct {
+			topic string
+			hour  int
+		}
+		counts := map[topicHour]int{}
+		for _, dp := range resp.Data {
+			if dp.Topic != nil && dp.TimeBucket != nil && dp.Count != nil {
+				counts[topicHour{*dp.Topic, dp.TimeBucket.Hour()}] = *dp.Count
+			}
+		}
+		for topic := range topics {
+			// range-over-int (Go 1.22+): h takes values 0..23.
+			for h := range 24 {
+				_, ok := counts[topicHour{topic, h}]
+				assert.True(t, ok, "topic=%s hour=%d must have a bucket", topic, h)
+			}
+		}
+	})
+
+	// ── 8. Too many buckets → ErrResourceLimit ─────────────────────────
+	// A granularity + time range that exceeds 100k buckets must be rejected.
+	// NOTE(review): the 100k cap is assumed to be enforced by every driver —
+	// confirm each driver implementation applies the same limit.
+
+	t.Run("too many buckets returns ErrResourceLimit (events)", func(t *testing.T) {
+		// 1s granularity over ~2 days = 172800 buckets > 100k limit
+		_, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{
+			Filters: map[string][]string{"tenant_id": {ds.tenant1}},
+			TimeRange: driver.TimeRange{
+				Start: time.Date(2000, 1, 1, 0, 0, 0, 0, time.UTC),
+				End:   time.Date(2000, 1, 3, 0, 0, 0, 0, time.UTC),
+			},
+			Granularity: &driver.Granularity{Value: 1, Unit: "s"},
+			Measures:    []string{"count"},
+		})
+		require.Error(t, err)
+		assert.ErrorIs(t, err, driver.ErrResourceLimit)
+	})
+
+	t.Run("too many buckets returns ErrResourceLimit (attempts)", func(t *testing.T) {
+		_, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{
+			Filters: map[string][]string{"tenant_id": {ds.tenant1}},
+			TimeRange: driver.TimeRange{
+				Start: time.Date(2000, 1, 1, 0, 0, 0, 0, time.UTC),
+				End:   time.Date(2000, 1, 3, 0, 0, 0, 0, time.UTC),
+			},
+			Granularity: &driver.Granularity{Value: 1, Unit: "s"},
+			Measures:    []string{"count"},
+		})
+		require.Error(t, err)
+		assert.ErrorIs(t, err, driver.ErrResourceLimit)
+	})
+
+	// ── 9. No granularity → no bucket filling ──────────────────────────
+	// When granularity is omitted, bucket filling does not apply.
+	// Empty results remain empty (single aggregate row or none).
+
+	t.Run("no granularity no filling", func(t *testing.T) {
+		resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{
+			Filters: map[string][]string{"tenant_id": {ds.tenant1}},
+			TimeRange: driver.TimeRange{
+				Start: time.Date(2000, 2, 1, 0, 0, 0, 0, time.UTC),
+				End:   time.Date(2000, 3, 1, 0, 0, 0, 0, time.UTC),
+			},
+			Measures: []string{"count"},
+		})
+		require.NoError(t, err)
+		assert.Empty(t, resp.Data, "without granularity, no-data range should return empty")
+	})
+}
diff --git a/internal/logstore/drivertest/metrics_data_correctness.go b/internal/logstore/drivertest/metrics_data_correctness.go
new file mode 100644
index 000000000..c7d3a349c
--- /dev/null
+++ b/internal/logstore/drivertest/metrics_data_correctness.go
@@ -0,0 +1,634 @@
+package drivertest
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	"github.com/hookdeck/outpost/internal/logstore/driver"
+	"github.com/hookdeck/outpost/internal/util/testutil"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func testMetricsDataCorrectness(t *testing.T, ctx context.Context, logStore driver.LogStore, ds *metricsDataset) {
+	fullRange := ds.timeRange.toDriver()
+	denseRange := ds.denseDayRange.toDriver()
+
+	// ── Event Metrics ────────────────────────────────────────────────
+
+	t.Run("EventMetrics", func(t *testing.T) {
+		t.Run("count all", func(t *testing.T) {
+			resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{
+				Filters:   map[string][]string{"tenant_id": {ds.tenant1}},
+				TimeRange: fullRange,
+				Measures:  []string{"count"},
+			})
+			require.NoError(t, err)
+			require.Len(t, resp.Data, 1)
+			require.NotNil(t, resp.Data[0].Count)
+			assert.Equal(t, 300, *resp.Data[0].Count)
+		})
+
+		t.Run("by topic", func(t *testing.T) {
+			resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{
+				Filters:    map[string][]string{"tenant_id": {ds.tenant1}},
+				TimeRange:  fullRange,
+				Measures:   []string{"count"},
+				Dimensions: []string{"topic"},
+			})
require.NoError(t, err) + assert.Len(t, resp.Data, 3) + + tc := map[string]int{} + for _, dp := range resp.Data { + require.NotNil(t, dp.Topic) + require.NotNil(t, dp.Count) + tc[*dp.Topic] = *dp.Count + } + assert.Equal(t, 100, tc[testutil.TestTopics[0]]) // user.created + assert.Equal(t, 100, tc[testutil.TestTopics[1]]) // user.deleted + assert.Equal(t, 100, tc[testutil.TestTopics[2]]) // user.updated + }) + + t.Run("by destination_id", func(t *testing.T) { + resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: fullRange, + Measures: []string{"count"}, + Dimensions: []string{"destination_id"}, + }) + require.NoError(t, err) + assert.Len(t, resp.Data, 2) + + dc := map[string]int{} + for _, dp := range resp.Data { + require.NotNil(t, dp.DestinationID) + require.NotNil(t, dp.Count) + dc[*dp.DestinationID] = *dp.Count + } + assert.Equal(t, 150, dc[ds.dest1_1]) + assert.Equal(t, 150, dc[ds.dest1_2]) + }) + + t.Run("by tenant_id", func(t *testing.T) { + resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{ + TimeRange: fullRange, + Measures: []string{"count"}, + Dimensions: []string{"tenant_id"}, + }) + require.NoError(t, err) + assert.Len(t, resp.Data, 2) + + tc := map[string]int{} + for _, dp := range resp.Data { + require.NotNil(t, dp.TenantID) + require.NotNil(t, dp.Count) + tc[*dp.TenantID] = *dp.Count + } + assert.Equal(t, 300, tc[ds.tenant1]) + assert.Equal(t, 5, tc[ds.tenant2]) + }) + + t.Run("filter by topic", func(t *testing.T) { + resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{ + TimeRange: fullRange, + Measures: []string{"count"}, + Filters: map[string][]string{"tenant_id": {ds.tenant1}, "topic": {testutil.TestTopics[0]}}, + }) + require.NoError(t, err) + require.Len(t, resp.Data, 1) + require.NotNil(t, resp.Data[0].Count) + assert.Equal(t, 100, *resp.Data[0].Count) + }) + + t.Run("filter by destination_id", func(t *testing.T) { + resp, 
err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{ + TimeRange: fullRange, + Measures: []string{"count"}, + Filters: map[string][]string{"tenant_id": {ds.tenant1}, "destination_id": {ds.dest1_1}}, + }) + require.NoError(t, err) + require.Len(t, resp.Data, 1) + require.NotNil(t, resp.Data[0].Count) + assert.Equal(t, 150, *resp.Data[0].Count) + }) + + t.Run("tenant isolation", func(t *testing.T) { + resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant2}}, + TimeRange: fullRange, + Measures: []string{"count"}, + }) + require.NoError(t, err) + require.Len(t, resp.Data, 1) + require.NotNil(t, resp.Data[0].Count) + assert.Equal(t, 5, *resp.Data[0].Count) + }) + + t.Run("empty time range", func(t *testing.T) { + resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: driver.TimeRange{ + Start: time.Date(1999, 1, 1, 0, 0, 0, 0, time.UTC), + End: time.Date(1999, 2, 1, 0, 0, 0, 0, time.UTC), + }, + Measures: []string{"count"}, + }) + require.NoError(t, err) + assert.Empty(t, resp.Data) + }) + + t.Run("rate no granularity", func(t *testing.T) { + resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: fullRange, + Measures: []string{"rate"}, + }) + require.NoError(t, err) + require.Len(t, resp.Data, 1) + require.NotNil(t, resp.Data[0].Rate) + // rate = count / total_seconds = 300 / (31 days * 86400) + assert.InDelta(t, 300.0/2678400.0, *resp.Data[0].Rate, 0.0000001) + }) + + t.Run("rate with 1h granularity on dense day", func(t *testing.T) { + resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: denseRange, + Granularity: &driver.Granularity{Value: 1, Unit: "h"}, + Measures: []string{"count", "rate"}, + }) + require.NoError(t, err) + 
assert.Len(t, resp.Data, 24) + + for _, dp := range resp.Data { + require.NotNil(t, dp.TimeBucket) + require.NotNil(t, dp.Count) + require.NotNil(t, dp.Rate) + // rate = count / 3600 (1h bucket) + expected := float64(*dp.Count) / 3600.0 + assert.InDelta(t, expected, *dp.Rate, 0.0000001, + "hour %d: rate should be count/3600", dp.TimeBucket.Hour()) + } + }) + + t.Run("granularity 1M", func(t *testing.T) { + resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: fullRange, + Granularity: &driver.Granularity{Value: 1, Unit: "M"}, + Measures: []string{"count"}, + }) + require.NoError(t, err) + require.Len(t, resp.Data, 1) + require.NotNil(t, resp.Data[0].TimeBucket) + require.NotNil(t, resp.Data[0].Count) + assert.Equal(t, 300, *resp.Data[0].Count) + }) + + t.Run("granularity 1w", func(t *testing.T) { + resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: fullRange, + Granularity: &driver.Granularity{Value: 1, Unit: "w"}, + Measures: []string{"count"}, + }) + require.NoError(t, err) + require.NotEmpty(t, resp.Data) + + total := 0 + for _, dp := range resp.Data { + require.NotNil(t, dp.TimeBucket) + require.NotNil(t, dp.Count) + total += *dp.Count + } + assert.Equal(t, 300, total) + }) + + t.Run("granularity 2d preserves total count", func(t *testing.T) { + resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: fullRange, + Granularity: &driver.Granularity{Value: 2, Unit: "d"}, + Measures: []string{"count"}, + }) + require.NoError(t, err) + require.NotEmpty(t, resp.Data) + + total := 0 + for _, dp := range resp.Data { + require.NotNil(t, dp.TimeBucket) + require.NotNil(t, dp.Count) + total += *dp.Count + } + // All 300 events must be accounted for — none silently dropped. 
+ assert.Equal(t, 300, total) + }) + + t.Run("granularity 1d on dense day range", func(t *testing.T) { + resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: denseRange, + Granularity: &driver.Granularity{Value: 1, Unit: "d"}, + Measures: []string{"count"}, + }) + require.NoError(t, err) + require.Len(t, resp.Data, 1) + require.NotNil(t, resp.Data[0].Count) + assert.Equal(t, 250, *resp.Data[0].Count) + }) + + t.Run("granularity 1h on dense day", func(t *testing.T) { + resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: denseRange, + Granularity: &driver.Granularity{Value: 1, Unit: "h"}, + Measures: []string{"count"}, + }) + require.NoError(t, err) + assert.Len(t, resp.Data, 24) + + hourly := map[int]int{} + total := 0 + for _, dp := range resp.Data { + require.NotNil(t, dp.TimeBucket) + require.NotNil(t, dp.Count) + hourly[dp.TimeBucket.Hour()] = *dp.Count + total += *dp.Count + } + assert.Equal(t, 25, hourly[10]) + assert.Equal(t, 50, hourly[11]) + assert.Equal(t, 100, hourly[12]) + assert.Equal(t, 50, hourly[13]) + assert.Equal(t, 25, hourly[14]) + assert.Equal(t, 250, total) + }) + + t.Run("granularity 1m on dense day hour 10", func(t *testing.T) { + hour10Range := driver.TimeRange{ + Start: time.Date(2000, 1, 15, 10, 0, 0, 0, time.UTC), + End: time.Date(2000, 1, 15, 11, 0, 0, 0, time.UTC), + } + resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: hour10Range, + Granularity: &driver.Granularity{Value: 1, Unit: "m"}, + Measures: []string{"count"}, + }) + require.NoError(t, err) + // 60 minutes in the hour, bucket filling produces all 60 + assert.Len(t, resp.Data, 60) + + total := 0 + for _, dp := range resp.Data { + require.NotNil(t, dp.TimeBucket) + require.NotNil(t, dp.Count) + total += *dp.Count + } + 
assert.Equal(t, 25, total) + }) + + t.Run("granularity 1m on dense day hour 12", func(t *testing.T) { + hour12Range := driver.TimeRange{ + Start: time.Date(2000, 1, 15, 12, 0, 0, 0, time.UTC), + End: time.Date(2000, 1, 15, 13, 0, 0, 0, time.UTC), + } + resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: hour12Range, + Granularity: &driver.Granularity{Value: 1, Unit: "m"}, + Measures: []string{"count"}, + }) + require.NoError(t, err) + // 60 minutes in the hour, bucket filling produces all 60 + assert.Len(t, resp.Data, 60) + + total := 0 + for _, dp := range resp.Data { + require.NotNil(t, dp.TimeBucket) + require.NotNil(t, dp.Count) + total += *dp.Count + } + assert.Equal(t, 100, total) + }) + }) + + // ── Attempt Metrics ──────────────────────────────────────────────── + + t.Run("AttemptMetrics", func(t *testing.T) { + t.Run("count all", func(t *testing.T) { + resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: fullRange, + Measures: []string{"count"}, + }) + require.NoError(t, err) + require.Len(t, resp.Data, 1) + require.NotNil(t, resp.Data[0].Count) + assert.Equal(t, 300, *resp.Data[0].Count) + }) + + t.Run("successful and failed counts", func(t *testing.T) { + resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: fullRange, + Measures: []string{"count", "successful_count", "failed_count"}, + }) + require.NoError(t, err) + require.Len(t, resp.Data, 1) + dp := resp.Data[0] + require.NotNil(t, dp.Count) + require.NotNil(t, dp.SuccessfulCount) + require.NotNil(t, dp.FailedCount) + assert.Equal(t, 300, *dp.Count) + assert.Equal(t, 180, *dp.SuccessfulCount) + assert.Equal(t, 120, *dp.FailedCount) + }) + + t.Run("error rate", func(t *testing.T) { + resp, err := logStore.QueryAttemptMetrics(ctx, 
driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: fullRange, + Measures: []string{"error_rate"}, + }) + require.NoError(t, err) + require.Len(t, resp.Data, 1) + require.NotNil(t, resp.Data[0].ErrorRate) + assert.InDelta(t, 0.4, *resp.Data[0].ErrorRate, 0.001) + }) + + t.Run("retry measures", func(t *testing.T) { + resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: fullRange, + Measures: []string{"first_attempt_count", "retry_count", "manual_retry_count", "avg_attempt_number"}, + }) + require.NoError(t, err) + require.Len(t, resp.Data, 1) + dp := resp.Data[0] + require.NotNil(t, dp.FirstAttemptCount) + require.NotNil(t, dp.RetryCount) + require.NotNil(t, dp.ManualRetryCount) + require.NotNil(t, dp.AvgAttemptNumber) + assert.Equal(t, 75, *dp.FirstAttemptCount) + assert.Equal(t, 225, *dp.RetryCount) + assert.Equal(t, 30, *dp.ManualRetryCount) + assert.InDelta(t, 2.5, *dp.AvgAttemptNumber, 0.001) + }) + + t.Run("rate no granularity", func(t *testing.T) { + resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: fullRange, + Measures: []string{"rate", "successful_rate", "failed_rate"}, + }) + require.NoError(t, err) + require.Len(t, resp.Data, 1) + dp := resp.Data[0] + require.NotNil(t, dp.Rate) + require.NotNil(t, dp.SuccessfulRate) + require.NotNil(t, dp.FailedRate) + // total_seconds = 31 days * 86400 = 2678400 + assert.InDelta(t, 300.0/2678400.0, *dp.Rate, 0.0000001) + assert.InDelta(t, 180.0/2678400.0, *dp.SuccessfulRate, 0.0000001) + assert.InDelta(t, 120.0/2678400.0, *dp.FailedRate, 0.0000001) + }) + + t.Run("rate with 1h granularity on dense day", func(t *testing.T) { + resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: denseRange, + Granularity: 
&driver.Granularity{Value: 1, Unit: "h"}, + Measures: []string{"count", "rate"}, + }) + require.NoError(t, err) + assert.Len(t, resp.Data, 24) + + for _, dp := range resp.Data { + require.NotNil(t, dp.TimeBucket) + require.NotNil(t, dp.Count) + require.NotNil(t, dp.Rate) + expected := float64(*dp.Count) / 3600.0 + assert.InDelta(t, expected, *dp.Rate, 0.0000001, + "hour %d: rate should be count/3600", dp.TimeBucket.Hour()) + } + }) + + t.Run("by status", func(t *testing.T) { + resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: fullRange, + Measures: []string{"count"}, + Dimensions: []string{"status"}, + }) + require.NoError(t, err) + assert.Len(t, resp.Data, 2) + + sc := map[string]int{} + for _, dp := range resp.Data { + require.NotNil(t, dp.Status) + require.NotNil(t, dp.Count) + sc[*dp.Status] = *dp.Count + } + assert.Equal(t, 180, sc["success"]) + assert.Equal(t, 120, sc["failed"]) + }) + + t.Run("by destination_id", func(t *testing.T) { + resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: fullRange, + Measures: []string{"count"}, + Dimensions: []string{"destination_id"}, + }) + require.NoError(t, err) + assert.Len(t, resp.Data, 2) + + dc := map[string]int{} + for _, dp := range resp.Data { + require.NotNil(t, dp.DestinationID) + require.NotNil(t, dp.Count) + dc[*dp.DestinationID] = *dp.Count + } + assert.Equal(t, 150, dc[ds.dest1_1]) + assert.Equal(t, 150, dc[ds.dest1_2]) + }) + + t.Run("by tenant_id", func(t *testing.T) { + resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{ + TimeRange: fullRange, + Measures: []string{"count"}, + Dimensions: []string{"tenant_id"}, + }) + require.NoError(t, err) + assert.Len(t, resp.Data, 2) + + tc := map[string]int{} + for _, dp := range resp.Data { + require.NotNil(t, dp.TenantID) + require.NotNil(t, dp.Count) + 
tc[*dp.TenantID] = *dp.Count + } + assert.Equal(t, 300, tc[ds.tenant1]) + assert.Equal(t, 5, tc[ds.tenant2]) + }) + + t.Run("by attempt_number", func(t *testing.T) { + resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: fullRange, + Measures: []string{"count"}, + Dimensions: []string{"attempt_number"}, + }) + require.NoError(t, err) + assert.Len(t, resp.Data, 4) + + ac := map[int]int{} + for _, dp := range resp.Data { + require.NotNil(t, dp.AttemptNumber) + require.NotNil(t, dp.Count) + ac[*dp.AttemptNumber] = *dp.Count + } + // attempt_number = i % 4 + 1 → each value appears 75 times + assert.Equal(t, 75, ac[1]) + assert.Equal(t, 75, ac[2]) + assert.Equal(t, 75, ac[3]) + assert.Equal(t, 75, ac[4]) + }) + + t.Run("by code", func(t *testing.T) { + resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: fullRange, + Measures: []string{"count"}, + Dimensions: []string{"code"}, + }) + require.NoError(t, err) + assert.Len(t, resp.Data, 4) + + cc := map[string]int{} + for _, dp := range resp.Data { + require.NotNil(t, dp.Code) + require.NotNil(t, dp.Count) + cc[*dp.Code] = *dp.Count + } + assert.Equal(t, 90, cc["200"]) + assert.Equal(t, 90, cc["201"]) + assert.Equal(t, 60, cc["500"]) + assert.Equal(t, 60, cc["422"]) + }) + + t.Run("filter by status", func(t *testing.T) { + resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{ + TimeRange: fullRange, + Measures: []string{"count"}, + Filters: map[string][]string{"tenant_id": {ds.tenant1}, "status": {"failed"}}, + }) + require.NoError(t, err) + require.Len(t, resp.Data, 1) + require.NotNil(t, resp.Data[0].Count) + assert.Equal(t, 120, *resp.Data[0].Count) + }) + + t.Run("filter by topic", func(t *testing.T) { + resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{ + TimeRange: fullRange, + Measures: []string{"count"}, + 
Filters: map[string][]string{"tenant_id": {ds.tenant1}, "topic": {testutil.TestTopics[0]}}, + }) + require.NoError(t, err) + require.Len(t, resp.Data, 1) + require.NotNil(t, resp.Data[0].Count) + assert.Equal(t, 100, *resp.Data[0].Count) + }) + + t.Run("filter by code", func(t *testing.T) { + resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{ + TimeRange: fullRange, + Measures: []string{"count"}, + Filters: map[string][]string{"tenant_id": {ds.tenant1}, "code": {"500"}}, + }) + require.NoError(t, err) + require.Len(t, resp.Data, 1) + require.NotNil(t, resp.Data[0].Count) + assert.Equal(t, 60, *resp.Data[0].Count) + }) + + t.Run("filter by manual", func(t *testing.T) { + resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{ + TimeRange: fullRange, + Measures: []string{"count"}, + Filters: map[string][]string{"tenant_id": {ds.tenant1}, "manual": {"true"}}, + }) + require.NoError(t, err) + require.Len(t, resp.Data, 1) + require.NotNil(t, resp.Data[0].Count) + assert.Equal(t, 30, *resp.Data[0].Count) + }) + + t.Run("filter by attempt_number", func(t *testing.T) { + resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{ + TimeRange: fullRange, + Measures: []string{"count"}, + Filters: map[string][]string{"tenant_id": {ds.tenant1}, "attempt_number": {"1"}}, + }) + require.NoError(t, err) + require.Len(t, resp.Data, 1) + require.NotNil(t, resp.Data[0].Count) + assert.Equal(t, 75, *resp.Data[0].Count) + }) + + t.Run("granularity 1h on dense day", func(t *testing.T) { + resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: denseRange, + Granularity: &driver.Granularity{Value: 1, Unit: "h"}, + Measures: []string{"count"}, + }) + require.NoError(t, err) + assert.Len(t, resp.Data, 24) + + hourly := map[int]int{} + total := 0 + for _, dp := range resp.Data { + require.NotNil(t, dp.TimeBucket) + require.NotNil(t, dp.Count) + 
hourly[dp.TimeBucket.Hour()] = *dp.Count + total += *dp.Count + } + assert.Equal(t, 25, hourly[10]) + assert.Equal(t, 50, hourly[11]) + assert.Equal(t, 100, hourly[12]) + assert.Equal(t, 50, hourly[13]) + assert.Equal(t, 25, hourly[14]) + assert.Equal(t, 250, total) + }) + + t.Run("tenant isolation", func(t *testing.T) { + resp, err := logStore.QueryAttemptMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant2}}, + TimeRange: fullRange, + Measures: []string{"count"}, + }) + require.NoError(t, err) + require.Len(t, resp.Data, 1) + require.NotNil(t, resp.Data[0].Count) + assert.Equal(t, 5, *resp.Data[0].Count) + }) + }) + + // ── Metadata ─────────────────────────────────────────────────────── + + t.Run("Metadata", func(t *testing.T) { + resp, err := logStore.QueryEventMetrics(ctx, driver.MetricsRequest{ + Filters: map[string][]string{"tenant_id": {ds.tenant1}}, + TimeRange: fullRange, + Measures: []string{"count"}, + }) + require.NoError(t, err) + assert.Equal(t, 1, resp.Metadata.RowCount) + assert.False(t, resp.Metadata.Truncated) + assert.Greater(t, resp.Metadata.RowLimit, 0) + }) +} diff --git a/internal/logstore/drivertest/metrics_dataset.go b/internal/logstore/drivertest/metrics_dataset.go new file mode 100644 index 000000000..f4ae8ae5f --- /dev/null +++ b/internal/logstore/drivertest/metrics_dataset.go @@ -0,0 +1,276 @@ +package drivertest + +import ( + "fmt" + "time" + + "github.com/hookdeck/outpost/internal/logstore/driver" + "github.com/hookdeck/outpost/internal/models" + "github.com/hookdeck/outpost/internal/util/testutil" +) + +// ============================================================================ +// Metrics Test Dataset +// ============================================================================ +// +// Time range: 2000-01-01 00:00 UTC → 2000-02-01 00:00 UTC (full January) +// +// Two tenants: +// - Tenant 1: 300 events/attempts (the main dataset) +// - Tenant 2: 5 events/attempts (isolation checks only) 
+// +// Destinations: +// - Tenant 1: dest_1.1, dest_1.2 +// - Tenant 2: dest_2.1 +// +// IDs: +// - Tenants: "m_tenant_1", "m_tenant_2" +// - Destinations: "m_dest_1.1", "m_dest_1.2", "m_dest_2.1" +// - Events: "m_evt_1_{idx}", "m_evt_2_{idx}" +// - Attempts: "m_att_1_{idx}", "m_att_2_{idx}" +// +// ── Tenant 1 Distribution ──────────────────────────────────────────────── +// +// Sparse days (50 events total, 5 days × 10 events each): +// Jan 3 — 10 events, one per hour 09:00–18:00 +// Jan 7 — 10 events, one per hour 09:00–18:00 +// Jan 11 — 10 events, one per hour 09:00–18:00 +// Jan 22 — 10 events, one per hour 09:00–18:00 +// Jan 28 — 10 events, one per hour 09:00–18:00 +// +// Dense day — Jan 15 (250 events): +// Bell-curve distribution across 5 hours: +// 10:00–10:59 → 25 events +// 11:00–11:59 → 50 events +// 12:00–12:59 → 100 events +// 13:00–13:59 → 50 events +// 14:00–14:59 → 25 events +// +// Within each hour, events are spread evenly using: +// offset = i * 3600s / countForHour (seconds from top of hour) +// +// ── Dimension Cycling (Tenant 1) ───────────────────────────────────────── +// +// All 300 events are numbered 0–299 in insertion order. Dimensions cycle: +// +// topic: i % 3 → 0=user.created, 1=user.deleted, 2=user.updated +// destination: i % 2 → 0=dest_1.1, 1=dest_1.2 +// status: i % 5 → 0,1,2=success, 3,4=failed +// code: success → i%2==0 ? "200" : "201" +// failed → i%2==0 ? "500" : "422" +// attempt_number: i % 4 + 1 → 1,2,3,4 +// manual: i % 10 == 9 +// eligible_for_retry: i % 3 != 2 +// +// FIXME: manual retries should always have attempt_number=1 (they start a new +// chain), but this dataset assigns them independently. Update the formula so +// manual=true implies attempt_number=1, then fix derived totals and assertions. 
+// +// ── Derived Totals (Tenant 1, all 300) ─────────────────────────────────── +// +// Event metrics: +// count: 300 +// by topic: user.created=100, user.deleted=100, user.updated=100 +// by destination: dest_1.1=150, dest_1.2=150 +// by eligible_for_retry: true=200, false=100 +// +// Attempt metrics: +// count: 300 +// successful (i%5 in {0,1,2}): 180 +// failed (i%5 in {3,4}): 120 +// error_rate: 120/300 = 0.4 +// rate (no gran): 300/2678400 (per second over full range) +// successful_rate (no gran): 180/2678400 +// failed_rate (no gran): 120/2678400 +// by code: 200=90, 201=90, 500=60, 422=60 +// first_attempt (i%4+1==1): 75 +// retry (i%4+1>1): 225 +// manual (i%10==9): 30 +// avg_attempt_number: 750/300 = 2.5 +// +// Dense day — Jan 15 (250 events, indices 50..299): +// hourly buckets: 10:00→25, 11:00→50, 12:00→100, 13:00→50, 14:00→25 +// +// ── Tenant 2 ───────────────────────────────────────────────────────────── +// +// 5 events, all topic=user.created, dest=dest_2.1, status=success, code=200, +// attempt_number=1, manual=false, eligible_for_retry=true +// +// Jan 5 09:00, Jan 10 09:00, Jan 15 12:15, Jan 22 09:00, Jan 27 09:00 +// +// ============================================================================ + +const ( + mTenant1 = "m_tenant_1" + mTenant2 = "m_tenant_2" + mDest1_1 = "m_dest_1.1" + mDest1_2 = "m_dest_1.2" + mDest2_1 = "m_dest_2.1" +) + +// metricsDataset holds the seeded data and pre-computed constants for assertions. +type metricsDataset struct { + tenant1 string + tenant2 string + dest1_1 string // tenant 1's first destination + dest1_2 string // tenant 1's second destination + dest2_1 string // tenant 2's destination + entries []*models.LogEntry + + // Full time range covering all data. + timeRange timeRange + + // Dense day time range (Jan 15 only). 
+ denseDayRange timeRange +} + +type timeRange struct { + start time.Time + end time.Time +} + +func (d timeRange) toDriver() driver.TimeRange { + return driver.TimeRange{Start: d.start, End: d.end} +} + +var ( + // January 2000 + dsStart = time.Date(2000, 1, 1, 0, 0, 0, 0, time.UTC) + dsEnd = time.Date(2000, 2, 1, 0, 0, 0, 0, time.UTC) + + // Dense day + dsDenseDay = time.Date(2000, 1, 15, 0, 0, 0, 0, time.UTC) +) + +// Sparse days: 5 days × 10 events each = 50 events. +// Each day has 10 events, one per hour from 09:00–18:00. +var sparseDays = []int{3, 7, 11, 22, 28} + +// Dense day hourly distribution (bell curve, total=250). +var denseHours = []struct { + hour int + count int +}{ + {10, 25}, + {11, 50}, + {12, 100}, + {13, 50}, + {14, 25}, +} + +func buildMetricsDataset() *metricsDataset { + topics := testutil.TestTopics // [user.created, user.deleted, user.updated] — sorted + codes := map[string][2]string{ + "success": {"200", "201"}, + "failed": {"500", "422"}, + } + + var entries []*models.LogEntry + idx := 0 + + makeEntry := func(tenant string, eventTime time.Time) *models.LogEntry { + dest := mDest1_1 + if idx%2 == 1 { + dest = mDest1_2 + } + topic := topics[idx%3] + status := "success" + if idx%5 == 3 || idx%5 == 4 { + status = "failed" + } + code := codes[status][idx%2] + attemptNum := idx%4 + 1 + manual := idx%10 == 9 + eligible := idx%3 != 2 + + event := testutil.EventFactory.AnyPointer( + testutil.EventFactory.WithID(fmt.Sprintf("m_evt_1_%d", idx)), + testutil.EventFactory.WithTenantID(tenant), + testutil.EventFactory.WithDestinationID(dest), + testutil.EventFactory.WithTopic(topic), + testutil.EventFactory.WithTime(eventTime), + testutil.EventFactory.WithEligibleForRetry(eligible), + ) + attempt := testutil.AttemptFactory.AnyPointer( + testutil.AttemptFactory.WithID(fmt.Sprintf("m_att_1_%d", idx)), + testutil.AttemptFactory.WithTenantID(tenant), + testutil.AttemptFactory.WithEventID(event.ID), + testutil.AttemptFactory.WithDestinationID(dest), + 
testutil.AttemptFactory.WithStatus(status), + testutil.AttemptFactory.WithCode(code), + testutil.AttemptFactory.WithTime(eventTime.Add(time.Millisecond)), + testutil.AttemptFactory.WithAttemptNumber(attemptNum), + testutil.AttemptFactory.WithManual(manual), + ) + + idx++ + return &models.LogEntry{Event: event, Attempt: attempt} + } + + // ── Sparse days (indices 0–49): 5 days × 10 events ── + for _, day := range sparseDays { + for j := range 10 { + t := time.Date(2000, 1, day, 9+j, 0, 0, 0, time.UTC) + entries = append(entries, makeEntry(mTenant1, t)) + } + } + + // ── Dense day — Jan 15 (indices 50–299): 250 events ── + for _, dh := range denseHours { + for i := range dh.count { + offsetSec := i * 3600 / dh.count + t := time.Date(2000, 1, 15, dh.hour, 0, 0, 0, time.UTC).Add( + time.Duration(offsetSec) * time.Second, + ) + entries = append(entries, makeEntry(mTenant1, t)) + } + } + + // ── Tenant 2 (5 events, independent — not using makeEntry/idx) ── + tenant2Times := []time.Time{ + time.Date(2000, 1, 5, 9, 0, 0, 0, time.UTC), + time.Date(2000, 1, 10, 9, 0, 0, 0, time.UTC), + time.Date(2000, 1, 15, 12, 15, 0, 0, time.UTC), + time.Date(2000, 1, 22, 9, 0, 0, 0, time.UTC), + time.Date(2000, 1, 27, 9, 0, 0, 0, time.UTC), + } + for i, bt := range tenant2Times { + event := testutil.EventFactory.AnyPointer( + testutil.EventFactory.WithID(fmt.Sprintf("m_evt_2_%d", i)), + testutil.EventFactory.WithTenantID(mTenant2), + testutil.EventFactory.WithDestinationID(mDest2_1), + testutil.EventFactory.WithTopic(topics[0]), + testutil.EventFactory.WithTime(bt), + testutil.EventFactory.WithEligibleForRetry(true), + ) + attempt := testutil.AttemptFactory.AnyPointer( + testutil.AttemptFactory.WithID(fmt.Sprintf("m_att_2_%d", i)), + testutil.AttemptFactory.WithTenantID(mTenant2), + testutil.AttemptFactory.WithEventID(event.ID), + testutil.AttemptFactory.WithDestinationID(mDest2_1), + testutil.AttemptFactory.WithStatus("success"), + testutil.AttemptFactory.WithCode("200"), + 
testutil.AttemptFactory.WithTime(bt.Add(time.Millisecond)), + testutil.AttemptFactory.WithAttemptNumber(1), + testutil.AttemptFactory.WithManual(false), + ) + entries = append(entries, &models.LogEntry{Event: event, Attempt: attempt}) + } + + return &metricsDataset{ + tenant1: mTenant1, + tenant2: mTenant2, + dest1_1: mDest1_1, + dest1_2: mDest1_2, + dest2_1: mDest2_1, + entries: entries, + timeRange: timeRange{ + start: dsStart, + end: dsEnd, + }, + denseDayRange: timeRange{ + start: dsDenseDay, + end: dsDenseDay.Add(24 * time.Hour), + }, + } +} diff --git a/internal/logstore/logstore.go b/internal/logstore/logstore.go index b0557792e..04a1b55ad 100644 --- a/internal/logstore/logstore.go +++ b/internal/logstore/logstore.go @@ -23,13 +23,16 @@ type RetrieveAttemptRequest = driver.RetrieveAttemptRequest type AttemptRecord = driver.AttemptRecord type LogEntry = models.LogEntry -type LogStore interface { - ListEvent(context.Context, ListEventRequest) (ListEventResponse, error) - ListAttempt(context.Context, ListAttemptRequest) (ListAttemptResponse, error) - RetrieveEvent(ctx context.Context, request RetrieveEventRequest) (*models.Event, error) - RetrieveAttempt(ctx context.Context, request RetrieveAttemptRequest) (*AttemptRecord, error) - InsertMany(context.Context, []*models.LogEntry) error -} +type MetricsRequest = driver.MetricsRequest +type MetricsMetadata = driver.MetricsMetadata +type TimeRange = driver.TimeRange +type Granularity = driver.Granularity +type EventMetricsDataPoint = driver.EventMetricsDataPoint +type EventMetricsResponse = driver.EventMetricsResponse +type AttemptMetricsDataPoint = driver.AttemptMetricsDataPoint +type AttemptMetricsResponse = driver.AttemptMetricsResponse + +type LogStore = driver.LogStore type DriverOpts struct { CH clickhouse.DB diff --git a/internal/logstore/memlogstore/metrics.go b/internal/logstore/memlogstore/metrics.go new file mode 100644 index 000000000..629b9bd2f --- /dev/null +++ 
b/internal/logstore/memlogstore/metrics.go @@ -0,0 +1,397 @@ +package memlogstore + +import ( + "context" + "fmt" + "time" + + "github.com/hookdeck/outpost/internal/logstore/bucket" + "github.com/hookdeck/outpost/internal/logstore/driver" + "github.com/hookdeck/outpost/internal/models" +) + +const defaultRowLimit = 100000 + +func (s *memLogStore) QueryEventMetrics(ctx context.Context, req driver.MetricsRequest) (*driver.EventMetricsResponse, error) { + if err := driver.ValidateMetricsRequest(req); err != nil { + return nil, err + } + req.Measures = driver.EnrichMeasuresForRates(req.Measures) + s.mu.RLock() + defer s.mu.RUnlock() + + start := time.Now() + + // Filter events + var matched []*models.Event + for _, event := range s.events { + if !matchesEventMetricsFilter(event, req) { + continue + } + matched = append(matched, event) + } + + // Group by dimensions + time bucket + type groupKey struct { + timeBucket string + tenantID string + topic string + destID string + } + + groups := map[groupKey][]*models.Event{} + for _, event := range matched { + key := groupKey{} + if req.Granularity != nil { + tb := bucket.TruncateTime(event.Time, req.Granularity) + key.timeBucket = tb.Format(time.RFC3339) + } + for _, dim := range req.Dimensions { + switch dim { + case "tenant_id": + key.tenantID = event.TenantID + case "topic": + key.topic = event.Topic + case "destination_id": + key.destID = event.DestinationID + } + } + groups[key] = append(groups[key], event) + } + + // Build response + var data []driver.EventMetricsDataPoint + for key, events := range groups { + dp := driver.EventMetricsDataPoint{} + + if key.timeBucket != "" { + tb, _ := time.Parse(time.RFC3339, key.timeBucket) + dp.TimeBucket = &tb + } + + // Dimensions + for _, dim := range req.Dimensions { + switch dim { + case "tenant_id": + v := key.tenantID + dp.TenantID = &v + case "topic": + v := key.topic + dp.Topic = &v + case "destination_id": + v := key.destID + dp.DestinationID = &v + } + } + + // Measures 
+ for _, measure := range req.Measures { + switch measure { + case "count": + c := len(events) + dp.Count = &c + } + } + + data = append(data, dp) + } + + // Handle empty result — no groups means no matching data + if len(groups) == 0 { + data = []driver.EventMetricsDataPoint{} + } + + data, fillErr := bucket.FillEventBuckets(data, req) + if fillErr != nil { + return nil, fmt.Errorf("fill event buckets: %w: %w", driver.ErrResourceLimit, fillErr) + } + driver.ComputeEventRates(data, req) + + elapsed := time.Since(start) + return &driver.EventMetricsResponse{ + Data: data, + Metadata: driver.MetricsMetadata{ + QueryTimeMs: elapsed.Milliseconds(), + RowCount: len(data), + RowLimit: defaultRowLimit, + Truncated: false, + }, + }, nil +} + +func (s *memLogStore) QueryAttemptMetrics(ctx context.Context, req driver.MetricsRequest) (*driver.AttemptMetricsResponse, error) { + if err := driver.ValidateMetricsRequest(req); err != nil { + return nil, err + } + req.Measures = driver.EnrichMeasuresForRates(req.Measures) + s.mu.RLock() + defer s.mu.RUnlock() + + start := time.Now() + + var matched []attemptWithEvent + for _, a := range s.attempts { + event := s.events[a.EventID] + if event == nil { + continue + } + if !matchesAttemptMetricsFilter(a, event, req) { + continue + } + matched = append(matched, attemptWithEvent{attempt: a, event: event}) + } + + // Group by dimensions + time bucket + type groupKey struct { + timeBucket string + tenantID string + destID string + topic string + status string + code string + manual string + attemptNum string + } + + groups := map[groupKey][]attemptWithEvent{} + for _, ae := range matched { + key := groupKey{} + if req.Granularity != nil { + tb := bucket.TruncateTime(ae.attempt.Time, req.Granularity) + key.timeBucket = tb.Format(time.RFC3339) + } + for _, dim := range req.Dimensions { + switch dim { + case "tenant_id": + key.tenantID = ae.event.TenantID + case "destination_id": + key.destID = ae.attempt.DestinationID + case "topic": + 
key.topic = ae.event.Topic + case "status": + key.status = ae.attempt.Status + case "code": + key.code = ae.attempt.Code + case "manual": + if ae.attempt.Manual { + key.manual = "true" + } else { + key.manual = "false" + } + case "attempt_number": + key.attemptNum = fmt.Sprintf("%d", ae.attempt.AttemptNumber) + } + } + groups[key] = append(groups[key], ae) + } + + // Build response + var data []driver.AttemptMetricsDataPoint + for key, attempts := range groups { + dp := driver.AttemptMetricsDataPoint{} + + if key.timeBucket != "" { + tb, _ := time.Parse(time.RFC3339, key.timeBucket) + dp.TimeBucket = &tb + } + + // Dimensions + for _, dim := range req.Dimensions { + switch dim { + case "tenant_id": + v := key.tenantID + dp.TenantID = &v + case "destination_id": + v := key.destID + dp.DestinationID = &v + case "topic": + v := key.topic + dp.Topic = &v + case "status": + v := key.status + dp.Status = &v + case "code": + v := key.code + dp.Code = &v + case "manual": + v := key.manual == "true" + dp.Manual = &v + case "attempt_number": + v := attempts[0].attempt.AttemptNumber + dp.AttemptNumber = &v + } + } + + // Measures + for _, measure := range req.Measures { + switch measure { + case "count": + c := len(attempts) + dp.Count = &c + case "successful_count": + c := countByStatus(attempts, "success") + dp.SuccessfulCount = &c + case "failed_count": + c := countByStatus(attempts, "failed") + dp.FailedCount = &c + case "error_rate": + total := len(attempts) + failed := countByStatus(attempts, "failed") + var rate float64 + if total > 0 { + rate = float64(failed) / float64(total) + } + dp.ErrorRate = &rate + case "first_attempt_count": + c := 0 + for _, ae := range attempts { + if ae.attempt.AttemptNumber == 1 && !ae.attempt.Manual { + c++ + } + } + dp.FirstAttemptCount = &c + case "retry_count": + c := 0 + for _, ae := range attempts { + if ae.attempt.AttemptNumber > 1 { + c++ + } + } + dp.RetryCount = &c + case "manual_retry_count": + c := 0 + for _, ae := range 
attempts { + if ae.attempt.Manual { + c++ + } + } + dp.ManualRetryCount = &c + case "avg_attempt_number": + total := 0 + for _, ae := range attempts { + total += ae.attempt.AttemptNumber + } + var avg float64 + if len(attempts) > 0 { + avg = float64(total) / float64(len(attempts)) + } + dp.AvgAttemptNumber = &avg + } + } + + data = append(data, dp) + } + + if len(groups) == 0 { + data = []driver.AttemptMetricsDataPoint{} + } + + data, fillErr := bucket.FillAttemptBuckets(data, req) + if fillErr != nil { + return nil, fmt.Errorf("fill attempt buckets: %w: %w", driver.ErrResourceLimit, fillErr) + } + driver.ComputeAttemptRates(data, req) + + elapsed := time.Since(start) + return &driver.AttemptMetricsResponse{ + Data: data, + Metadata: driver.MetricsMetadata{ + QueryTimeMs: elapsed.Milliseconds(), + RowCount: len(data), + RowLimit: defaultRowLimit, + Truncated: false, + }, + }, nil +} + +func matchesEventMetricsFilter(event *models.Event, req driver.MetricsRequest) bool { + if tenantIDs, ok := req.Filters["tenant_id"]; ok { + if !contains(tenantIDs, event.TenantID) { + return false + } + } + if event.Time.Before(req.TimeRange.Start) || !event.Time.Before(req.TimeRange.End) { + return false + } + if topics, ok := req.Filters["topic"]; ok { + if !contains(topics, event.Topic) { + return false + } + } + if dests, ok := req.Filters["destination_id"]; ok { + if !contains(dests, event.DestinationID) { + return false + } + } + return true +} + +func matchesAttemptMetricsFilter(a *models.Attempt, event *models.Event, req driver.MetricsRequest) bool { + if tenantIDs, ok := req.Filters["tenant_id"]; ok { + if !contains(tenantIDs, event.TenantID) { + return false + } + } + if a.Time.Before(req.TimeRange.Start) || !a.Time.Before(req.TimeRange.End) { + return false + } + if statuses, ok := req.Filters["status"]; ok { + if !contains(statuses, a.Status) { + return false + } + } + if dests, ok := req.Filters["destination_id"]; ok { + if !contains(dests, a.DestinationID) { + return 
false + } + } + if topics, ok := req.Filters["topic"]; ok { + if !contains(topics, event.Topic) { + return false + } + } + if codes, ok := req.Filters["code"]; ok { + if !contains(codes, a.Code) { + return false + } + } + if manuals, ok := req.Filters["manual"]; ok { + manualStr := "false" + if a.Manual { + manualStr = "true" + } + if !contains(manuals, manualStr) { + return false + } + } + if attemptNums, ok := req.Filters["attempt_number"]; ok { + if !contains(attemptNums, fmt.Sprintf("%d", a.AttemptNumber)) { + return false + } + } + return true +} + +func contains(slice []string, val string) bool { + for _, s := range slice { + if s == val { + return true + } + } + return false +} + +func countByStatus(attempts []attemptWithEvent, status string) int { + c := 0 + for _, ae := range attempts { + if ae.attempt.Status == status { + c++ + } + } + return c +} + +type attemptWithEvent struct { + attempt *models.Attempt + event *models.Event +} diff --git a/internal/logstore/pglogstore/metrics.go b/internal/logstore/pglogstore/metrics.go new file mode 100644 index 000000000..d0d099c5b --- /dev/null +++ b/internal/logstore/pglogstore/metrics.go @@ -0,0 +1,553 @@ +package pglogstore + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/hookdeck/outpost/internal/logstore/bucket" + "github.com/hookdeck/outpost/internal/logstore/driver" +) + +const ( + defaultRowLimit = 100000 + metricsQueryTimeout = 30 * time.Second +) + +// metricsCtx adds a fallback timeout to the context if the caller didn't set +// one. When the deadline fires, pgx cancels the running statement on the +// PostgreSQL side via pg_cancel_backend. 
+func metricsCtx(ctx context.Context) (context.Context, context.CancelFunc) { + if _, ok := ctx.Deadline(); ok { + return ctx, func() {} // caller already set a deadline + } + return context.WithTimeout(ctx, metricsQueryTimeout) +} + +// timeBucketExpr returns a PostgreSQL expression that truncates a timestamptz +// column to the given granularity, matching the Go truncation semantics used +// in the in-memory driver. +func timeBucketExpr(col string, g *driver.Granularity) string { + switch g.Unit { + case "s": + return fmt.Sprintf("date_bin('%d seconds'::interval, %s, '2000-01-01T00:00:00Z'::timestamptz)", g.Value, col) + case "m": + return fmt.Sprintf("date_bin('%d minutes'::interval, %s, '2000-01-01T00:00:00Z'::timestamptz)", g.Value, col) + case "h": + return fmt.Sprintf("date_bin('%d hours'::interval, %s, '2000-01-01T00:00:00Z'::timestamptz)", g.Value, col) + case "d": + if g.Value == 1 { + return fmt.Sprintf("date_trunc('day', %s AT TIME ZONE 'UTC') AT TIME ZONE 'UTC'", col) + } + return fmt.Sprintf("date_bin('%d days'::interval, %s, '1970-01-01T00:00:00Z'::timestamptz)", g.Value, col) + case "w": + // Sunday-based weeks. Anchor to 1970-01-04 (Sunday) for consistent + // alignment matching Go's time.Weekday convention. + return fmt.Sprintf("date_bin('%d days'::interval, %s, '1970-01-04T00:00:00Z'::timestamptz)", g.Value*7, col) + case "M": + if g.Value == 1 { + return fmt.Sprintf("date_trunc('month', %s AT TIME ZONE 'UTC') AT TIME ZONE 'UTC'", col) + } + // PG date_bin doesn't support month intervals. Compute epoch-aligned + // month boundary: floor month number to nearest multiple of Value. 
+ return fmt.Sprintf( + "('1970-01-01'::timestamptz + ((((EXTRACT(YEAR FROM %s AT TIME ZONE 'UTC')::int - 1970) * 12 + EXTRACT(MONTH FROM %s AT TIME ZONE 'UTC')::int - 1) / %d) * %d) * INTERVAL '1 month')", + col, col, g.Value, g.Value, + ) + default: + return col + } +} + +// ── Event Metrics ───────────────────────────────────────────────────────── + +func (s *logStore) QueryEventMetrics(ctx context.Context, req driver.MetricsRequest) (*driver.EventMetricsResponse, error) { + if err := driver.ValidateMetricsRequest(req); err != nil { + return nil, err + } + req.Measures = driver.EnrichMeasuresForRates(req.Measures) + ctx, cancel := metricsCtx(ctx) + defer cancel() + + start := time.Now() + + var ( + selectExprs []string + groupExprs []string + conditions []string + args []any + argNum int + ) + + arg := func(v any) string { + argNum++ + args = append(args, v) + return fmt.Sprintf("$%d", argNum) + } + + // Track which fields appear in each result row so we can build matching + // scan destinations and map them back onto the data-point struct. 
+ type sf int + const ( + sfTimeBucket sf = iota + sfTenantID + sfTopic + sfDestID + sfCount + ) + var order []sf + + // Time bucket + if req.Granularity != nil { + expr := timeBucketExpr("time", req.Granularity) + selectExprs = append(selectExprs, expr+" AS time_bucket") + groupExprs = append(groupExprs, expr) + order = append(order, sfTimeBucket) + } + + // Dimensions + for _, dim := range req.Dimensions { + switch dim { + case "tenant_id": + selectExprs = append(selectExprs, "tenant_id") + groupExprs = append(groupExprs, "tenant_id") + order = append(order, sfTenantID) + case "topic": + selectExprs = append(selectExprs, "topic") + groupExprs = append(groupExprs, "topic") + order = append(order, sfTopic) + case "destination_id": + selectExprs = append(selectExprs, "destination_id") + groupExprs = append(groupExprs, "destination_id") + order = append(order, sfDestID) + } + } + + // Measures + for _, measure := range req.Measures { + switch measure { + case "count": + selectExprs = append(selectExprs, "COUNT(*)") + order = append(order, sfCount) + } + } + + // WHERE + if tenantIDs, ok := req.Filters["tenant_id"]; ok { + conditions = append(conditions, "tenant_id = ANY("+arg(tenantIDs)+")") + } + conditions = append(conditions, "time >= "+arg(req.TimeRange.Start)) + conditions = append(conditions, "time < "+arg(req.TimeRange.End)) + + if topics, ok := req.Filters["topic"]; ok { + conditions = append(conditions, "topic = ANY("+arg(topics)+")") + } + if dests, ok := req.Filters["destination_id"]; ok { + conditions = append(conditions, "destination_id = ANY("+arg(dests)+")") + } + + // Build SQL + query := "SELECT " + strings.Join(selectExprs, ", ") + + " FROM events WHERE " + strings.Join(conditions, " AND ") + if len(groupExprs) > 0 { + query += " GROUP BY " + strings.Join(groupExprs, ", ") + } + query += " HAVING COUNT(*) > 0" + if len(groupExprs) > 0 { + query += " ORDER BY " + strings.Join(groupExprs, ", ") + } + query += fmt.Sprintf(" LIMIT %d", 
defaultRowLimit+1) + + rows, err := s.db.Query(ctx, query, args...) + if err != nil { + return nil, fmt.Errorf("query event metrics: %w", err) + } + defer rows.Close() + + // Prepare reusable scan destinations (one variable per possible field). + var ( + tbVal time.Time + tenantIDVal string + topicVal string + destIDVal string + countVal int + ) + scanDests := make([]any, len(order)) + for i, f := range order { + switch f { + case sfTimeBucket: + scanDests[i] = &tbVal + case sfTenantID: + scanDests[i] = &tenantIDVal + case sfTopic: + scanDests[i] = &topicVal + case sfDestID: + scanDests[i] = &destIDVal + case sfCount: + scanDests[i] = &countVal + } + } + + data := []driver.EventMetricsDataPoint{} + for rows.Next() { + if err := rows.Scan(scanDests...); err != nil { + return nil, fmt.Errorf("scan event metrics: %w", err) + } + + dp := driver.EventMetricsDataPoint{} + for _, f := range order { + switch f { + case sfTimeBucket: + t := tbVal.UTC() + dp.TimeBucket = &t + case sfTenantID: + v := tenantIDVal + dp.TenantID = &v + case sfTopic: + v := topicVal + dp.Topic = &v + case sfDestID: + v := destIDVal + dp.DestinationID = &v + case sfCount: + v := countVal + dp.Count = &v + } + } + data = append(data, dp) + } + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("rows error: %w", err) + } + + truncated := len(data) > defaultRowLimit + if truncated { + data = data[:defaultRowLimit] + } + + data, err = bucket.FillEventBuckets(data, req) + if err != nil { + return nil, fmt.Errorf("fill event buckets: %w: %w", driver.ErrResourceLimit, err) + } + driver.ComputeEventRates(data, req) + + elapsed := time.Since(start) + return &driver.EventMetricsResponse{ + Data: data, + Metadata: driver.MetricsMetadata{ + QueryTimeMs: elapsed.Milliseconds(), + RowCount: len(data), + RowLimit: defaultRowLimit, + Truncated: truncated, + }, + }, nil +} + +// ── Attempt Metrics ─────────────────────────────────────────────────────── + +func (s *logStore) QueryAttemptMetrics(ctx 
context.Context, req driver.MetricsRequest) (*driver.AttemptMetricsResponse, error) { + if err := driver.ValidateMetricsRequest(req); err != nil { + return nil, err + } + req.Measures = driver.EnrichMeasuresForRates(req.Measures) + ctx, cancel := metricsCtx(ctx) + defer cancel() + + start := time.Now() + + var ( + selectExprs []string + groupExprs []string + conditions []string + args []any + argNum int + ) + + arg := func(v any) string { + argNum++ + args = append(args, v) + return fmt.Sprintf("$%d", argNum) + } + + type sf int + const ( + sfTimeBucket sf = iota + sfTenantID + sfDestID + sfTopic + sfStatus + sfCode + sfManual + sfAttemptNumber + sfCount + sfSuccessCount + sfFailedCount + sfErrorRate + sfFirstAttempt + sfRetryCount + sfManualRetry + sfAvgAttemptNum + ) + var order []sf + + // Time bucket + if req.Granularity != nil { + expr := timeBucketExpr("time", req.Granularity) + selectExprs = append(selectExprs, expr+" AS time_bucket") + groupExprs = append(groupExprs, expr) + order = append(order, sfTimeBucket) + } + + // Dimensions + for _, dim := range req.Dimensions { + switch dim { + case "tenant_id": + selectExprs = append(selectExprs, "tenant_id") + groupExprs = append(groupExprs, "tenant_id") + order = append(order, sfTenantID) + case "destination_id": + selectExprs = append(selectExprs, "destination_id") + groupExprs = append(groupExprs, "destination_id") + order = append(order, sfDestID) + case "topic": + selectExprs = append(selectExprs, "topic") + groupExprs = append(groupExprs, "topic") + order = append(order, sfTopic) + case "status": + selectExprs = append(selectExprs, "status") + groupExprs = append(groupExprs, "status") + order = append(order, sfStatus) + case "code": + selectExprs = append(selectExprs, "COALESCE(code, '')") + groupExprs = append(groupExprs, "code") + order = append(order, sfCode) + case "manual": + selectExprs = append(selectExprs, "manual") + groupExprs = append(groupExprs, "manual") + order = append(order, sfManual) + case 
"attempt_number": + selectExprs = append(selectExprs, "attempt_number") + groupExprs = append(groupExprs, "attempt_number") + order = append(order, sfAttemptNumber) + } + } + + // Measures + for _, measure := range req.Measures { + switch measure { + case "count": + selectExprs = append(selectExprs, "COUNT(*)") + order = append(order, sfCount) + case "successful_count": + selectExprs = append(selectExprs, "COUNT(*) FILTER (WHERE status = 'success')") + order = append(order, sfSuccessCount) + case "failed_count": + selectExprs = append(selectExprs, "COUNT(*) FILTER (WHERE status = 'failed')") + order = append(order, sfFailedCount) + case "error_rate": + selectExprs = append(selectExprs, "COUNT(*) FILTER (WHERE status = 'failed')::float8 / COUNT(*)") + order = append(order, sfErrorRate) + case "first_attempt_count": + selectExprs = append(selectExprs, "COUNT(*) FILTER (WHERE attempt_number = 1 AND NOT manual)") + order = append(order, sfFirstAttempt) + case "retry_count": + selectExprs = append(selectExprs, "COUNT(*) FILTER (WHERE attempt_number > 1)") + order = append(order, sfRetryCount) + case "manual_retry_count": + selectExprs = append(selectExprs, "COUNT(*) FILTER (WHERE manual = true)") + order = append(order, sfManualRetry) + case "avg_attempt_number": + selectExprs = append(selectExprs, "AVG(attempt_number)::float8") + order = append(order, sfAvgAttemptNum) + } + } + + // WHERE + if tenantIDs, ok := req.Filters["tenant_id"]; ok { + conditions = append(conditions, "tenant_id = ANY("+arg(tenantIDs)+")") + } + conditions = append(conditions, "time >= "+arg(req.TimeRange.Start)) + conditions = append(conditions, "time < "+arg(req.TimeRange.End)) + + if statuses, ok := req.Filters["status"]; ok { + conditions = append(conditions, "status = ANY("+arg(statuses)+")") + } + if dests, ok := req.Filters["destination_id"]; ok { + conditions = append(conditions, "destination_id = ANY("+arg(dests)+")") + } + if topics, ok := req.Filters["topic"]; ok { + conditions = 
append(conditions, "topic = ANY("+arg(topics)+")") + } + if codes, ok := req.Filters["code"]; ok { + conditions = append(conditions, "code = ANY("+arg(codes)+")") + } + if manuals, ok := req.Filters["manual"]; ok { + conditions = append(conditions, "manual = ANY("+arg(manuals)+"::boolean[])") + } + if attemptNums, ok := req.Filters["attempt_number"]; ok { + conditions = append(conditions, "attempt_number = ANY("+arg(attemptNums)+"::integer[])") + } + + // Build SQL + query := "SELECT " + strings.Join(selectExprs, ", ") + + " FROM attempts WHERE " + strings.Join(conditions, " AND ") + if len(groupExprs) > 0 { + query += " GROUP BY " + strings.Join(groupExprs, ", ") + } + query += " HAVING COUNT(*) > 0" + if len(groupExprs) > 0 { + query += " ORDER BY " + strings.Join(groupExprs, ", ") + } + query += fmt.Sprintf(" LIMIT %d", defaultRowLimit+1) + + rows, err := s.db.Query(ctx, query, args...) + if err != nil { + return nil, fmt.Errorf("query attempt metrics: %w", err) + } + defer rows.Close() + + var ( + tbVal time.Time + tenantIDVal string + destIDVal string + topicVal string + statusVal string + codeVal string + manualVal bool + attemptNumberVal int + countVal int + successCount int + failedCount int + errorRate float64 + firstAttempt int + retryCount int + manualRetry int + avgAttemptNum float64 + ) + + scanDests := make([]any, len(order)) + for i, f := range order { + switch f { + case sfTimeBucket: + scanDests[i] = &tbVal + case sfTenantID: + scanDests[i] = &tenantIDVal + case sfDestID: + scanDests[i] = &destIDVal + case sfTopic: + scanDests[i] = &topicVal + case sfStatus: + scanDests[i] = &statusVal + case sfCode: + scanDests[i] = &codeVal + case sfManual: + scanDests[i] = &manualVal + case sfAttemptNumber: + scanDests[i] = &attemptNumberVal + case sfCount: + scanDests[i] = &countVal + case sfSuccessCount: + scanDests[i] = &successCount + case sfFailedCount: + scanDests[i] = &failedCount + case sfErrorRate: + scanDests[i] = &errorRate + case sfFirstAttempt: + 
scanDests[i] = &firstAttempt + case sfRetryCount: + scanDests[i] = &retryCount + case sfManualRetry: + scanDests[i] = &manualRetry + case sfAvgAttemptNum: + scanDests[i] = &avgAttemptNum + } + } + + data := []driver.AttemptMetricsDataPoint{} + for rows.Next() { + if err := rows.Scan(scanDests...); err != nil { + return nil, fmt.Errorf("scan attempt metrics: %w", err) + } + + dp := driver.AttemptMetricsDataPoint{} + for _, f := range order { + switch f { + case sfTimeBucket: + t := tbVal.UTC() + dp.TimeBucket = &t + case sfTenantID: + v := tenantIDVal + dp.TenantID = &v + case sfDestID: + v := destIDVal + dp.DestinationID = &v + case sfTopic: + v := topicVal + dp.Topic = &v + case sfStatus: + v := statusVal + dp.Status = &v + case sfCode: + v := codeVal + dp.Code = &v + case sfManual: + v := manualVal + dp.Manual = &v + case sfAttemptNumber: + v := attemptNumberVal + dp.AttemptNumber = &v + case sfCount: + v := countVal + dp.Count = &v + case sfSuccessCount: + v := successCount + dp.SuccessfulCount = &v + case sfFailedCount: + v := failedCount + dp.FailedCount = &v + case sfErrorRate: + v := errorRate + dp.ErrorRate = &v + case sfFirstAttempt: + v := firstAttempt + dp.FirstAttemptCount = &v + case sfRetryCount: + v := retryCount + dp.RetryCount = &v + case sfManualRetry: + v := manualRetry + dp.ManualRetryCount = &v + case sfAvgAttemptNum: + v := avgAttemptNum + dp.AvgAttemptNumber = &v + } + } + data = append(data, dp) + } + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("rows error: %w", err) + } + + truncated := len(data) > defaultRowLimit + if truncated { + data = data[:defaultRowLimit] + } + + data, err = bucket.FillAttemptBuckets(data, req) + if err != nil { + return nil, fmt.Errorf("fill attempt buckets: %w: %w", driver.ErrResourceLimit, err) + } + driver.ComputeAttemptRates(data, req) + + elapsed := time.Since(start) + return &driver.AttemptMetricsResponse{ + Data: data, + Metadata: driver.MetricsMetadata{ + QueryTimeMs: elapsed.Milliseconds(), 
+ RowCount: len(data), + RowLimit: defaultRowLimit, + Truncated: truncated, + }, + }, nil +} diff --git a/internal/portal/package-lock.json b/internal/portal/package-lock.json index ef6bd22a3..a66fbed8d 100644 --- a/internal/portal/package-lock.json +++ b/internal/portal/package-lock.json @@ -15,6 +15,7 @@ "react-dom": "^18.3.1", "react-markdown": "^9.0.3", "react-router-dom": "^6.28.0", + "recharts": "^3.7.0", "swr": "^2.2.5" }, "devDependencies": { @@ -2053,6 +2054,42 @@ "resolved": "https://registry.npmjs.org/@radix-ui/rect/-/rect-1.1.0.tgz", "integrity": "sha512-A9+lCBZoaMJlVKcRBz2YByCG+Cp2t6nAnMnNba+XiWxnj6r4JUFqfsgwocMBZU9LPtdxC6wB56ySYpc7LQIoJg==" }, + "node_modules/@reduxjs/toolkit": { + "version": "2.11.2", + "resolved": "https://registry.npmjs.org/@reduxjs/toolkit/-/toolkit-2.11.2.tgz", + "integrity": "sha512-Kd6kAHTA6/nUpp8mySPqj3en3dm0tdMIgbttnQ1xFMVpufoj+ADi8pXLBsd4xzTRHQa7t/Jv8W5UnCuW4kuWMQ==", + "license": "MIT", + "dependencies": { + "@standard-schema/spec": "^1.0.0", + "@standard-schema/utils": "^0.3.0", + "immer": "^11.0.0", + "redux": "^5.0.1", + "redux-thunk": "^3.1.0", + "reselect": "^5.1.0" + }, + "peerDependencies": { + "react": "^16.9.0 || ^17.0.0 || ^18 || ^19", + "react-redux": "^7.2.1 || ^8.1.3 || ^9.0.0" + }, + "peerDependenciesMeta": { + "react": { + "optional": true + }, + "react-redux": { + "optional": true + } + } + }, + "node_modules/@reduxjs/toolkit/node_modules/immer": { + "version": "11.1.4", + "resolved": "https://registry.npmjs.org/immer/-/immer-11.1.4.tgz", + "integrity": "sha512-XREFCPo6ksxVzP4E0ekD5aMdf8WMwmdNaz6vuvxgI40UaEiu6q3p8X52aU6GdyvLY3XXX/8R7JOTXStz/nBbRw==", + "license": "MIT", + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/immer" + } + }, "node_modules/@remix-run/router": { "version": "1.21.0", "resolved": "https://registry.npmjs.org/@remix-run/router/-/router-1.21.0.tgz", @@ -2562,6 +2599,18 @@ "node": ">= 14" } }, + "node_modules/@standard-schema/spec": { + "version": "1.1.0", + 
"resolved": "https://registry.npmjs.org/@standard-schema/spec/-/spec-1.1.0.tgz", + "integrity": "sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w==", + "license": "MIT" + }, + "node_modules/@standard-schema/utils": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/@standard-schema/utils/-/utils-0.3.0.tgz", + "integrity": "sha512-e7Mew686owMaPJVNNLs55PUvgz371nKgwsc4vxE49zsODpJEnxgxRo2y/OKrqueavXgZNMDVj3DdHFlaSAeU8g==", + "license": "MIT" + }, "node_modules/@swc/core": { "version": "1.7.26", "resolved": "https://registry.npmjs.org/@swc/core/-/core-1.7.26.tgz", @@ -2795,6 +2844,69 @@ "dev": true, "license": "MIT" }, + "node_modules/@types/d3-array": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/@types/d3-array/-/d3-array-3.2.2.tgz", + "integrity": "sha512-hOLWVbm7uRza0BYXpIIW5pxfrKe0W+D5lrFiAEYR+pb6w3N2SwSMaJbXdUfSEv+dT4MfHBLtn5js0LAWaO6otw==", + "license": "MIT" + }, + "node_modules/@types/d3-color": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/@types/d3-color/-/d3-color-3.1.3.tgz", + "integrity": "sha512-iO90scth9WAbmgv7ogoq57O9YpKmFBbmoEoCHDB2xMBY0+/KVrqAaCDyCE16dUspeOvIxFFRI+0sEtqDqy2b4A==", + "license": "MIT" + }, + "node_modules/@types/d3-ease": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/@types/d3-ease/-/d3-ease-3.0.2.tgz", + "integrity": "sha512-NcV1JjO5oDzoK26oMzbILE6HW7uVXOHLQvHshBUW4UMdZGfiY6v5BeQwh9a9tCzv+CeefZQHJt5SRgK154RtiA==", + "license": "MIT" + }, + "node_modules/@types/d3-interpolate": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@types/d3-interpolate/-/d3-interpolate-3.0.4.tgz", + "integrity": "sha512-mgLPETlrpVV1YRJIglr4Ez47g7Yxjl1lj7YKsiMCb27VJH9W8NVM6Bb9d8kkpG/uAQS5AmbA48q2IAolKKo1MA==", + "license": "MIT", + "dependencies": { + "@types/d3-color": "*" + } + }, + "node_modules/@types/d3-path": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/@types/d3-path/-/d3-path-3.1.1.tgz", + 
"integrity": "sha512-VMZBYyQvbGmWyWVea0EHs/BwLgxc+MKi1zLDCONksozI4YJMcTt8ZEuIR4Sb1MMTE8MMW49v0IwI5+b7RmfWlg==", + "license": "MIT" + }, + "node_modules/@types/d3-scale": { + "version": "4.0.9", + "resolved": "https://registry.npmjs.org/@types/d3-scale/-/d3-scale-4.0.9.tgz", + "integrity": "sha512-dLmtwB8zkAeO/juAMfnV+sItKjlsw2lKdZVVy6LRr0cBmegxSABiLEpGVmSJJ8O08i4+sGR6qQtb6WtuwJdvVw==", + "license": "MIT", + "dependencies": { + "@types/d3-time": "*" + } + }, + "node_modules/@types/d3-shape": { + "version": "3.1.8", + "resolved": "https://registry.npmjs.org/@types/d3-shape/-/d3-shape-3.1.8.tgz", + "integrity": "sha512-lae0iWfcDeR7qt7rA88BNiqdvPS5pFVPpo5OfjElwNaT2yyekbM0C9vK+yqBqEmHr6lDkRnYNoTBYlAgJa7a4w==", + "license": "MIT", + "dependencies": { + "@types/d3-path": "*" + } + }, + "node_modules/@types/d3-time": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@types/d3-time/-/d3-time-3.0.4.tgz", + "integrity": "sha512-yuzZug1nkAAaBlBBikKZTgzCeA+k1uy4ZFwWANOfKw5z5LRhV0gNA7gNkKm7HoK+HRN0wX3EkxGk0fpbWhmB7g==", + "license": "MIT" + }, + "node_modules/@types/d3-timer": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/@types/d3-timer/-/d3-timer-3.0.2.tgz", + "integrity": "sha512-Ps3T8E8dZDam6fUyNiMkekK3XUsaUEik+idO9/YjPtfj2qruF8tFBXS7XhtE4iIXBLxhmLjP3SXpLhVf21I9Lw==", + "license": "MIT" + }, "node_modules/@types/debug": { "version": "4.1.12", "resolved": "https://registry.npmjs.org/@types/debug/-/debug-4.1.12.tgz", @@ -2892,6 +3004,12 @@ "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==", "license": "MIT" }, + "node_modules/@types/use-sync-external-store": { + "version": "0.0.6", + "resolved": "https://registry.npmjs.org/@types/use-sync-external-store/-/use-sync-external-store-0.0.6.tgz", + "integrity": "sha512-zFDAD+tlpf2r4asuHEj0XH6pY6i0g5NeAHPn+15wk3BV6JA69eERFXC1gyGThDkVa1zCyKr5jox1+2LbV/AMLg==", + "license": "MIT" + }, "node_modules/@typescript-eslint/eslint-plugin": { 
"version": "8.50.0", "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-8.50.0.tgz", @@ -3495,6 +3613,15 @@ "resolved": "https://registry.npmjs.org/client-only/-/client-only-0.0.1.tgz", "integrity": "sha512-IV3Ou0jSMzZrd3pZ48nLkT9DA7Ag1pnPzaiQhpW7c3RbcqqzvzzVu+L8gfqMp/8IM2MQtSiqaCxrrcfu8I8rMA==" }, + "node_modules/clsx": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/clsx/-/clsx-2.1.1.tgz", + "integrity": "sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==", + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/color-convert": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", @@ -3565,6 +3692,127 @@ "integrity": "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==", "license": "MIT" }, + "node_modules/d3-array": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/d3-array/-/d3-array-3.2.4.tgz", + "integrity": "sha512-tdQAmyA18i4J7wprpYq8ClcxZy3SC31QMeByyCFyRt7BVHdREQZ5lpzoe5mFEYZUWe+oq8HBvk9JjpibyEV4Jg==", + "license": "ISC", + "dependencies": { + "internmap": "1 - 2" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-color": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/d3-color/-/d3-color-3.1.0.tgz", + "integrity": "sha512-zg/chbXyeBtMQ1LbD/WSoW2DpC3I0mpmPdW+ynRTj/x2DAWYrIY7qeZIHidozwV24m4iavr15lNwIwLxRmOxhA==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-ease": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-ease/-/d3-ease-3.0.1.tgz", + "integrity": "sha512-wR/XK3D3XcLIZwpbvQwQ5fK+8Ykds1ip7A2Txe0yxncXSdq1L9skcG7blcedkOX+ZcgxGAmLX1FrRGbADwzi0w==", + "license": "BSD-3-Clause", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-format": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/d3-format/-/d3-format-3.1.2.tgz", + 
"integrity": "sha512-AJDdYOdnyRDV5b6ArilzCPPwc1ejkHcoyFarqlPqT7zRYjhavcT3uSrqcMvsgh2CgoPbK3RCwyHaVyxYcP2Arg==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-interpolate": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-interpolate/-/d3-interpolate-3.0.1.tgz", + "integrity": "sha512-3bYs1rOD33uo8aqJfKP3JWPAibgw8Zm2+L9vBKEHJ2Rg+viTR7o5Mmv5mZcieN+FRYaAOWX5SJATX6k1PWz72g==", + "license": "ISC", + "dependencies": { + "d3-color": "1 - 3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-path": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/d3-path/-/d3-path-3.1.0.tgz", + "integrity": "sha512-p3KP5HCf/bvjBSSKuXid6Zqijx7wIfNW+J/maPs+iwR35at5JCbLUT0LzF1cnjbCHWhqzQTIN2Jpe8pRebIEFQ==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-scale": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/d3-scale/-/d3-scale-4.0.2.tgz", + "integrity": "sha512-GZW464g1SH7ag3Y7hXjf8RoUuAFIqklOAq3MRl4OaWabTFJY9PN/E1YklhXLh+OQ3fM9yS2nOkCoS+WLZ6kvxQ==", + "license": "ISC", + "dependencies": { + "d3-array": "2.10.0 - 3", + "d3-format": "1 - 3", + "d3-interpolate": "1.2.0 - 3", + "d3-time": "2.1.1 - 3", + "d3-time-format": "2 - 4" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-shape": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/d3-shape/-/d3-shape-3.2.0.tgz", + "integrity": "sha512-SaLBuwGm3MOViRq2ABk3eLoxwZELpH6zhl3FbAoJ7Vm1gofKx6El1Ib5z23NUEhF9AsGl7y+dzLe5Cw2AArGTA==", + "license": "ISC", + "dependencies": { + "d3-path": "^3.1.0" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-time": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/d3-time/-/d3-time-3.1.0.tgz", + "integrity": "sha512-VqKjzBLejbSMT4IgbmVgDjpkYrNWUYJnbCGo874u7MMKIWsILRX+OpX/gTk8MqjpT1A/c6HY2dCA77ZN0lkQ2Q==", + "license": "ISC", + "dependencies": { + "d3-array": "2 - 3" + }, + "engines": { + "node": ">=12" + } + }, + 
"node_modules/d3-time-format": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/d3-time-format/-/d3-time-format-4.1.0.tgz", + "integrity": "sha512-dJxPBlzC7NugB2PDLwo9Q8JiTR3M3e4/XANkreKSUxF8vvXKqm1Yfq4Q5dl8budlunRVlUUaDUgFt7eA8D6NLg==", + "license": "ISC", + "dependencies": { + "d3-time": "1 - 3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-timer": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-timer/-/d3-timer-3.0.1.tgz", + "integrity": "sha512-ndfJ/JxxMd3nw31uyKoY2naivF+r29V+Lc0svZxe1JvvIRmi8hUsrMvdOwgS1o6uBHmiz91geQ0ylPP0aj1VUA==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, "node_modules/debug": { "version": "4.4.0", "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.0.tgz", @@ -3582,6 +3830,12 @@ } } }, + "node_modules/decimal.js-light": { + "version": "2.5.1", + "resolved": "https://registry.npmjs.org/decimal.js-light/-/decimal.js-light-2.5.1.tgz", + "integrity": "sha512-qIMFpTMZmny+MMIitAB6D7iVPEorVw6YQRWkvarTkT4tBeSLLiHzcwj6q0MmYSFCiVpiqPJTJEYIrpcPzVEIvg==", + "license": "MIT" + }, "node_modules/decode-named-character-reference": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/decode-named-character-reference/-/decode-named-character-reference-1.0.2.tgz", @@ -3648,6 +3902,16 @@ "integrity": "sha512-dB68l59BI75W1BUGVTAEJy45CEVuEGy9qPVVQ8pnHyHMn36PLPPoE1mjLH+lo9rKulO3HC2OhbACI/8tCqJBcA==", "license": "ISC" }, + "node_modules/es-toolkit": { + "version": "1.45.1", + "resolved": "https://registry.npmjs.org/es-toolkit/-/es-toolkit-1.45.1.tgz", + "integrity": "sha512-/jhoOj/Fx+A+IIyDNOvO3TItGmlMKhtX8ISAHKE90c4b/k1tqaqEZ+uUqfpU8DMnW5cgNJv606zS55jGvza0Xw==", + "license": "MIT", + "workspaces": [ + "docs", + "benchmarks" + ] + }, "node_modules/esbuild": { "version": "0.24.0", "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.24.0.tgz", @@ -3947,6 +4211,12 @@ "node": ">=0.10.0" } }, + "node_modules/eventemitter3": { + "version": "5.0.4", + "resolved": 
"https://registry.npmjs.org/eventemitter3/-/eventemitter3-5.0.4.tgz", + "integrity": "sha512-mlsTRyGaPBjPedk6Bvw+aqbsXDtoAyAzm5MO7JgU+yVRyMQ5O8bD4Kcci7BS85f93veegeCPkL8R4GLClnjLFw==", + "license": "MIT" + }, "node_modules/extend": { "version": "3.0.2", "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", @@ -4221,6 +4491,16 @@ "node": ">= 4" } }, + "node_modules/immer": { + "version": "10.2.0", + "resolved": "https://registry.npmjs.org/immer/-/immer-10.2.0.tgz", + "integrity": "sha512-d/+XTN3zfODyjr89gM3mPq1WNX2B8pYsu7eORitdwyA2sBubnTl3laYlBk4sXY5FUa5qTZGBDPJICVbvqzjlbw==", + "license": "MIT", + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/immer" + } + }, "node_modules/immutable": { "version": "5.0.2", "resolved": "https://registry.npmjs.org/immutable/-/immutable-5.0.2.tgz", @@ -4260,6 +4540,15 @@ "integrity": "sha512-0aO8FkhNZlj/ZIbNi7Lxxr12obT7cL1moPfE4tg1LkX7LlLfC6DeX4l2ZEud1ukP9jNQyNnfzQVqwbwmAATY4Q==", "license": "MIT" }, + "node_modules/internmap": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/internmap/-/internmap-2.0.3.tgz", + "integrity": "sha512-5Hh7Y1wQbvY5ooGgPbDaL5iYLAPzMTUrjMulskHLH6wnv/A+1q5rgEaiuqEjB+oxGXIVZs1FF+R/KPN3ZSQYYg==", + "license": "ISC", + "engines": { + "node": ">=12" + } + }, "node_modules/is-alphabetical": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/is-alphabetical/-/is-alphabetical-2.0.1.tgz", @@ -5504,6 +5793,29 @@ "react": ">=18" } }, + "node_modules/react-redux": { + "version": "9.2.0", + "resolved": "https://registry.npmjs.org/react-redux/-/react-redux-9.2.0.tgz", + "integrity": "sha512-ROY9fvHhwOD9ySfrF0wmvu//bKCQ6AeZZq1nJNtbDC+kk5DuSuNX/n6YWYF/SYy7bSba4D4FSz8DJeKY/S/r+g==", + "license": "MIT", + "dependencies": { + "@types/use-sync-external-store": "^0.0.6", + "use-sync-external-store": "^1.4.0" + }, + "peerDependencies": { + "@types/react": "^18.2.25 || ^19", + "react": "^18.0 || ^19", + "redux": "^5.0.0" + }, + "peerDependenciesMeta": { + 
"@types/react": { + "optional": true + }, + "redux": { + "optional": true + } + } + }, "node_modules/react-remove-scroll": { "version": "2.6.2", "resolved": "https://registry.npmjs.org/react-remove-scroll/-/react-remove-scroll-2.6.2.tgz", @@ -5615,6 +5927,51 @@ "node": ">=8.10.0" } }, + "node_modules/recharts": { + "version": "3.7.0", + "resolved": "https://registry.npmjs.org/recharts/-/recharts-3.7.0.tgz", + "integrity": "sha512-l2VCsy3XXeraxIID9fx23eCb6iCBsxUQDnE8tWm6DFdszVAO7WVY/ChAD9wVit01y6B2PMupYiMmQwhgPHc9Ew==", + "license": "MIT", + "workspaces": [ + "www" + ], + "dependencies": { + "@reduxjs/toolkit": "1.x.x || 2.x.x", + "clsx": "^2.1.1", + "decimal.js-light": "^2.5.1", + "es-toolkit": "^1.39.3", + "eventemitter3": "^5.0.1", + "immer": "^10.1.1", + "react-redux": "8.x.x || 9.x.x", + "reselect": "5.1.1", + "tiny-invariant": "^1.3.3", + "use-sync-external-store": "^1.2.2", + "victory-vendor": "^37.0.2" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0", + "react-dom": "^16.0.0 || ^17.0.0 || ^18.0.0 || ^19.0.0", + "react-is": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" + } + }, + "node_modules/redux": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/redux/-/redux-5.0.1.tgz", + "integrity": "sha512-M9/ELqF6fy8FwmkpnF0S3YKOqMyoWJ4+CS5Efg2ct3oY9daQvd/Pc71FpGZsVsbl3Cpb+IIcjBDUnnyBdQbq4w==", + "license": "MIT" + }, + "node_modules/redux-thunk": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/redux-thunk/-/redux-thunk-3.1.0.tgz", + "integrity": "sha512-NW2r5T6ksUKXCabzhL9z+h206HQw/NJkcLm1GPImRQ8IzfXwRGqjVhKJGauHirT0DAuyy6hjdnMZaRoAcy0Klw==", + "license": "MIT", + "peerDependencies": { + "redux": "^5.0.0" + } + }, "node_modules/remark-parse": { "version": "11.0.0", "resolved": "https://registry.npmjs.org/remark-parse/-/remark-parse-11.0.0.tgz", @@ -5648,6 +6005,12 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/reselect": { + "version": "5.1.1", 
+ "resolved": "https://registry.npmjs.org/reselect/-/reselect-5.1.1.tgz", + "integrity": "sha512-K/BG6eIky/SBpzfHZv/dd+9JBFiS4SWV7FIujVyJRux6e45+73RaUHXLmIR1f7WOMaQ0U1km6qwklRQxpJJY0w==", + "license": "MIT" + }, "node_modules/resolve-from": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz", @@ -6233,6 +6596,12 @@ "node": ">=16.0.0" } }, + "node_modules/tiny-invariant": { + "version": "1.3.3", + "resolved": "https://registry.npmjs.org/tiny-invariant/-/tiny-invariant-1.3.3.tgz", + "integrity": "sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==", + "license": "MIT" + }, "node_modules/tinyglobby": { "version": "0.2.15", "resolved": "https://registry.npmjs.org/tinyglobby/-/tinyglobby-0.2.15.tgz", @@ -6578,11 +6947,12 @@ } }, "node_modules/use-sync-external-store": { - "version": "1.2.2", - "resolved": "https://registry.npmjs.org/use-sync-external-store/-/use-sync-external-store-1.2.2.tgz", - "integrity": "sha512-PElTlVMwpblvbNqQ82d2n6RjStvdSoNe9FG28kNfz3WiXilJm4DdNkEzRhCZuIDwY8U08WVihhGR5iRqAwfDiw==", + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/use-sync-external-store/-/use-sync-external-store-1.6.0.tgz", + "integrity": "sha512-Pp6GSwGP/NrPIrxVFAIkOQeyw8lFenOHijQWkUTrDvrF4ALqylP2C/KCkeS9dpUM3KvYRQhna5vt7IL95+ZQ9w==", + "license": "MIT", "peerDependencies": { - "react": "^16.8.0 || ^17.0.0 || ^18.0.0" + "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" } }, "node_modules/varint": { @@ -6619,6 +6989,28 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/victory-vendor": { + "version": "37.3.6", + "resolved": "https://registry.npmjs.org/victory-vendor/-/victory-vendor-37.3.6.tgz", + "integrity": "sha512-SbPDPdDBYp+5MJHhBCAyI7wKM3d5ivekigc2Dk2s7pgbZ9wIgIBYGVw4zGHBml/qTFbexrofXW6Gu4noGxrOwQ==", + "license": "MIT AND ISC", + "dependencies": { + "@types/d3-array": "^3.0.3", + "@types/d3-ease": "^3.0.0", + "@types/d3-interpolate": "^3.0.1", + 
"@types/d3-scale": "^4.0.2", + "@types/d3-shape": "^3.1.0", + "@types/d3-time": "^3.0.0", + "@types/d3-timer": "^3.0.0", + "d3-array": "^3.1.6", + "d3-ease": "^3.0.1", + "d3-interpolate": "^3.0.1", + "d3-scale": "^4.0.2", + "d3-shape": "^3.1.0", + "d3-time": "^3.0.0", + "d3-timer": "^3.0.1" + } + }, "node_modules/vite": { "version": "6.0.3", "resolved": "https://registry.npmjs.org/vite/-/vite-6.0.3.tgz", diff --git a/internal/portal/package.json b/internal/portal/package.json index 544d6931f..e3d881f57 100644 --- a/internal/portal/package.json +++ b/internal/portal/package.json @@ -18,6 +18,7 @@ "react-dom": "^18.3.1", "react-markdown": "^9.0.3", "react-router-dom": "^6.28.0", + "recharts": "^3.7.0", "swr": "^2.2.5" }, "devDependencies": { diff --git a/internal/portal/src/common/MetricsChart/MetricsBreakdown.scss b/internal/portal/src/common/MetricsChart/MetricsBreakdown.scss new file mode 100644 index 000000000..48e5489ce --- /dev/null +++ b/internal/portal/src/common/MetricsChart/MetricsBreakdown.scss @@ -0,0 +1,117 @@ +.metrics-breakdown { + display: flex; + flex-direction: column; + height: 100%; + + &__header { + display: flex; + align-items: baseline; + justify-content: space-between; + margin-bottom: var(--spacing-4); + } + + &__title { + font-weight: 500; + font-size: 14px; + line-height: 20px; + color: var(--colors-foreground-neutral); + } + + &__subtitle { + font-weight: 400; + color: var(--colors-foreground-neutral-3); + } + + &__rows { + flex: 1; + min-height: 0; + overflow-y: auto; + display: flex; + flex-direction: column; + gap: var(--spacing-3); + } + + &__row { + display: flex; + align-items: center; + gap: var(--spacing-3); + } + + &__dimension { + font-size: 13px; + font-family: var(--font-family-monospace); + color: var(--colors-foreground-neutral); + min-width: 100px; + max-width: 100px; + white-space: nowrap; + overflow: hidden; + text-overflow: ellipsis; + } + + &__bar-container { + flex: 1; + height: 8px; + background: 
var(--colors-background-neutral-2); + border-radius: 4px; + overflow: hidden; + } + + &__bar { + height: 100%; + background: var(--colors-dataviz-info); + border-radius: 4px; + transition: width 0.2s ease; + } + + &__bar--success { + background: var(--colors-dataviz-success); + } + + &__bar--error { + background: var(--colors-dataviz-error); + } + + &__column-headers { + display: flex; + justify-content: flex-end; + gap: var(--spacing-3); + font-size: 11px; + color: var(--colors-foreground-neutral-3); + margin-bottom: var(--spacing-1); + + span:first-child { + min-width: 40px; + text-align: right; + } + + span:last-child { + min-width: 48px; + text-align: right; + } + } + + &__count { + font-size: 13px; + font-family: var(--font-family-monospace); + color: var(--colors-foreground-neutral-2); + min-width: 40px; + text-align: right; + } + + &__rate { + font-size: 13px; + color: var(--colors-foreground-neutral-3); + min-width: 48px; + text-align: right; + } + + &__empty, + &__loading { + display: flex; + align-items: center; + justify-content: center; + flex: 1; + color: var(--colors-foreground-neutral-3); + font-size: 14px; + } +} diff --git a/internal/portal/src/common/MetricsChart/MetricsBreakdown.tsx b/internal/portal/src/common/MetricsChart/MetricsBreakdown.tsx new file mode 100644 index 000000000..f35f533e9 --- /dev/null +++ b/internal/portal/src/common/MetricsChart/MetricsBreakdown.tsx @@ -0,0 +1,134 @@ +import "./MetricsBreakdown.scss"; + +import { Loading } from "../Icons"; +import { MetricsDataPoint } from "./useMetrics"; + +const VARIANT_CSS_VARS: Record = { + success: "var(--colors-dataviz-success)", + error: "var(--colors-dataviz-error)", +}; + +function BreakdownBar({ width, variant }: { width: number; variant?: string }) { + const bg = variant ? VARIANT_CSS_VARS[variant] : undefined; + + return ( +
+ ); +} + +interface BreakdownRow { + dimension: string; + count: number; + error_rate?: number; +} + +interface MetricsBreakdownProps { + title: string; + subtitle?: string; + data: MetricsDataPoint[] | undefined; + dimensionKey: string; + loading: boolean; + error: boolean; + showErrorRate?: boolean; + barColor?: (dimension: string) => string | undefined; +} + +function toRows( + data: MetricsDataPoint[] | undefined, + dimension_key: string, +): BreakdownRow[] { + if (!data) return []; + return data + .map((d) => ({ + dimension: d.dimensions[dimension_key] ?? "unknown", + count: d.metrics.count ?? 0, + error_rate: d.metrics.error_rate, + })) + .sort((a, b) => b.count - a.count); +} + +const MetricsBreakdown: React.FC = ({ + title, + subtitle, + data, + dimensionKey, + loading, + error, + showErrorRate, + barColor, +}) => { + const rows = toRows(data, dimensionKey); + const max_count = rows.reduce((max, r) => Math.max(max, r.count), 0); + + const renderBody = () => { + if (loading) { + return ( +
+ +
+ ); + } + + if (error) { + return ( +
Failed to load metrics
+ ); + } + + if (rows.length === 0) { + return
No data
; + } + + return ( +
+ {rows.map((row) => ( +
+ + {row.dimension} + +
+ 0 ? (row.count / max_count) * 100 : 0} + variant={barColor?.(row.dimension)} + /> +
+ {row.count} + {showErrorRate && row.error_rate != null && ( + + {(row.error_rate * 100).toFixed(1)}% + + )} +
+ ))} +
+ ); + }; + + return ( +
+
+ + {title} + {subtitle && ( + / {subtitle} + )} + + {showErrorRate && ( +
+ Count + Error % +
+ )} +
+ {renderBody()} +
+ ); +}; + +export default MetricsBreakdown; diff --git a/internal/portal/src/common/MetricsChart/MetricsChart.scss b/internal/portal/src/common/MetricsChart/MetricsChart.scss new file mode 100644 index 000000000..e0d2a2495 --- /dev/null +++ b/internal/portal/src/common/MetricsChart/MetricsChart.scss @@ -0,0 +1,44 @@ +.metrics-chart { + display: flex; + flex-direction: column; + height: 100%; + + &__header { + margin-bottom: var(--spacing-4); + } + + &__title { + font-weight: 500; + font-size: 14px; + line-height: 20px; + color: var(--colors-foreground-neutral); + } + + &__subtitle { + font-weight: 400; + color: var(--colors-foreground-neutral-3); + text-transform: capitalize; + } + + &__body { + flex: 1; + min-height: 0; + } + + &__empty, + &__error { + display: flex; + align-items: center; + justify-content: center; + height: 100%; + color: var(--colors-foreground-neutral-3); + font-size: 14px; + } + + &__loading { + display: flex; + align-items: center; + justify-content: center; + height: 100%; + } +} diff --git a/internal/portal/src/common/MetricsChart/MetricsChart.tsx b/internal/portal/src/common/MetricsChart/MetricsChart.tsx new file mode 100644 index 000000000..0330db7d6 --- /dev/null +++ b/internal/portal/src/common/MetricsChart/MetricsChart.tsx @@ -0,0 +1,267 @@ +import "./MetricsChart.scss"; + +import { useEffect, useRef, useState } from "react"; +import { + BarChart, + Bar, + LineChart, + Line, + XAxis, + YAxis, + CartesianGrid, + Tooltip, +} from "recharts"; +import { Loading } from "../Icons"; + +// Replaces recharts' ResponsiveContainer which fires its internal ResizeObserver +// before flex layout resolves, measuring -1px and spamming console warnings. +// This hook waits for positive dimensions before we render the chart. 
+function useContainerSize(ref: React.RefObject) { + const [size, setSize] = useState({ width: 0, height: 0 }); + + useEffect(() => { + const el = ref.current; + if (!el) return; + + const observer = new ResizeObserver((entries) => { + const entry = entries[0]; + if (entry) { + const { width, height } = entry.contentRect; + setSize((prev) => + prev.width === width && prev.height === height + ? prev + : { width, height }, + ); + } + }); + + observer.observe(el); + return () => observer.disconnect(); + }, [ref]); + + return size; +} + +export type ChartDataPoint = Record & { + label: string; +}; + +export type ChartSeries = { + key: string; + label: string; + cssVar: string; +}; + +interface MetricsChartProps { + title: string; + subtitle?: string; + data: ChartDataPoint[]; + series: ChartSeries[]; + type: "bar" | "stacked-bar" | "line" | "multi-line"; + loading: boolean; + error: boolean; + yDomain?: [number, number]; + yTickFormatter?: (v: number) => string; + yAllowDecimals?: boolean; + tooltipFormatter?: (value: any, name: any) => [string, string]; +} + +function getYAxisWidth( + data: ChartDataPoint[], + series: ChartSeries[], + formatter?: (v: number) => string, +): number { + if (formatter) return 48; + let max_value = 0; + for (const d of data) { + for (const s of series) { + const v = Number(d[s.key] ?? 
0); + if (v > max_value) max_value = v; + } + } + return 32 + String(Math.floor(max_value)).length * 8; +} + +const CHART_MARGIN = { top: 0, right: 0, bottom: 0, left: 0 }; + +const TOOLTIP_STYLE = { + background: "var(--colors-background)", + border: "1px solid var(--colors-outline-neutral)", + borderRadius: 6, + fontSize: 13, +}; + +const MetricsChart: React.FC = ({ + title, + subtitle, + data, + series, + type, + loading, + error, + yDomain, + yTickFormatter, + yAllowDecimals, + tooltipFormatter, +}) => { + const colorRef = useRef(null); + const bodyRef = useRef(null); + const [colors, setColors] = useState>({}); + const { width: chartWidth, height: chartHeight } = useContainerSize(bodyRef); + const css_vars_key = series.map((s) => s.cssVar).join(","); + + useEffect(() => { + if (colorRef.current) { + const style = getComputedStyle(colorRef.current); + const resolved: Record = {}; + for (const v of css_vars_key.split(",")) { + const val = style.getPropertyValue(v).trim(); + if (val) resolved[v] = val; + } + setColors(resolved); + } + }, [css_vars_key]); + + const getColor = (css_var: string) => + colors[css_var] ?? "var(--colors-background-neutral-1)"; + + const renderBody = () => { + if (loading) { + return ( +
+ +
+ ); + } + + if (error) { + return
Failed to load metrics
; + } + + if (data.length === 0) { + return
No data
; + } + + if (chartWidth <= 0 || chartHeight <= 0) { + return null; + } + + const tick_style = { + fontSize: 12, + fill: "var(--colors-foreground-neutral-2)", + }; + + const x_axis_props = { + dataKey: "label" as const, + stroke: "var(--colors-outline-neutral)", + tick: tick_style, + tickMargin: 8, + minTickGap: 16, + }; + + const y_axis_props = { + width: getYAxisWidth(data, series, yTickFormatter), + allowDecimals: yAllowDecimals ?? false, + stroke: "transparent", + tick: tick_style, + tickMargin: 8, + minTickGap: 24, + ...(yDomain ? { domain: yDomain } : {}), + ...(yTickFormatter ? { tickFormatter: yTickFormatter } : {}), + }; + + if (type === "bar" || type === "stacked-bar") { + return ( + + + + + label} + formatter={ + tooltipFormatter ?? + ((value: any, name: any) => [String(value), String(name)]) + } + /> + {series.map((s) => ( + + ))} + + ); + } + + return ( + + + + + label} + formatter={ + tooltipFormatter ?? + ((value: any, name: any) => [String(value), String(name)]) + } + /> + {series.map((s) => ( + + ))} + + ); + }; + + return ( +
+
+ + {title} + {subtitle && ( + / {subtitle} + )} + +
+
+ {renderBody()} +
+
+ ); +}; + +export default MetricsChart; diff --git a/internal/portal/src/common/MetricsChart/useMetrics.ts b/internal/portal/src/common/MetricsChart/useMetrics.ts new file mode 100644 index 000000000..803615499 --- /dev/null +++ b/internal/portal/src/common/MetricsChart/useMetrics.ts @@ -0,0 +1,139 @@ +import { useContext, useMemo } from "react"; +import useSWR from "swr"; +import { ApiContext } from "../../app"; + +export type Timeframe = "1h" | "24h" | "7d" | "30d"; + +export type MetricsDataPoint = { + time_bucket: string; + dimensions: Record; + metrics: Record; +}; + +export type MetricsResponse = { + data: MetricsDataPoint[]; + metadata: { + granularity: string; + }; +}; + +// Round down to the nearest minute so the SWR key stays stable across renders +function roundToMinute(date: Date): Date { + const d = new Date(date); + d.setSeconds(0, 0); + return d; +} + +function getDateRange(timeframe: Timeframe) { + const now = roundToMinute(new Date()); + const end = now.toISOString(); + const start = new Date(now); + + switch (timeframe) { + case "1h": + start.setHours(start.getHours() - 1); + break; + case "24h": + start.setHours(start.getHours() - 24); + break; + case "7d": + start.setDate(start.getDate() - 7); + break; + case "30d": + start.setDate(start.getDate() - 30); + break; + } + + return { start: start.toISOString(), end }; +} + +function getGranularity(timeframe: Timeframe) { + switch (timeframe) { + case "1h": + return "1m"; + case "24h": + return "30m"; + case "7d": + return "3h"; + case "30d": + return "12h"; + } +} + +export function useMetrics({ + measures, + destinationId, + timeframe, + dimensions, + filters, + granularity: granularityOverride, +}: { + measures: string[]; + destinationId: string; + timeframe: Timeframe; + dimensions?: string[]; + filters?: Record; + granularity?: string; +}) { + const apiClient = useContext(ApiContext); + + // Stable keys for useMemo deps + const measuresKey = measures.join(","); + const dimensionsKey = 
dimensions?.join(",") ?? ""; + const filtersKey = filters + ? Object.entries(filters) + .map(([k, v]) => `${k}=${v}`) + .join(",") + : ""; + + const url = useMemo(() => { + const { start, end } = getDateRange(timeframe); + + const params = new URLSearchParams(); + params.set("time[start]", start); + params.set("time[end]", end); + params.set("filters[destination_id]", destinationId); + for (const m of measures) { + params.append("measures[]", m); + } + + if (filters) { + for (const [k, v] of Object.entries(filters)) { + params.set(`filters[${k}]`, v); + } + } + + if (dimensions && dimensions.length > 0) { + // Aggregate query — no granularity + for (const d of dimensions) { + params.append("dimensions[]", d); + } + } else { + // Time-series query — include granularity + params.set( + "granularity", + granularityOverride ?? getGranularity(timeframe), + ); + } + + return `metrics/attempts?${params.toString()}`; + }, [ + measuresKey, + dimensionsKey, + filtersKey, + granularityOverride, + destinationId, + timeframe, + ]); + + const { data, error, isLoading } = useSWR( + url, + (path: string) => apiClient.fetchRoot(path), + { + refreshInterval: 60_000, + revalidateOnFocus: false, + }, + ); + + return { data, error, isLoading }; +} diff --git a/internal/portal/src/common/Sparkline/Sparkline.scss b/internal/portal/src/common/Sparkline/Sparkline.scss new file mode 100644 index 000000000..850dbe84b --- /dev/null +++ b/internal/portal/src/common/Sparkline/Sparkline.scss @@ -0,0 +1,31 @@ +.sparkline { + display: flex; + align-items: flex-end; + gap: 2px; + height: 16px; + + &__bar { + width: 6px; + display: flex; + flex-direction: column; + justify-content: flex-end; + } + + &__segment { + width: 100%; + border-radius: 1px; + + &--success { + background: var(--colors-dataviz-success); + } + + &--error { + background: var(--colors-dataviz-error); + } + + &--empty { + height: 2px; + background: var(--colors-outline-neutral); + } + } +} diff --git 
a/internal/portal/src/common/Sparkline/Sparkline.tsx b/internal/portal/src/common/Sparkline/Sparkline.tsx new file mode 100644 index 000000000..35dcb1475 --- /dev/null +++ b/internal/portal/src/common/Sparkline/Sparkline.tsx @@ -0,0 +1,53 @@ +import "./Sparkline.scss"; + +interface SparklineDataPoint { + successful: number; + failed: number; +} + +interface SparklineProps { + data: SparklineDataPoint[]; +} + +const Sparkline: React.FC = ({ data }) => { + const max = data.reduce((m, d) => Math.max(m, d.successful + d.failed), 0); + + return ( +
+ {data.map((d, i) => { + const total = d.successful + d.failed; + + if (total === 0) { + return ( +
+
+
+ ); + } + + const height = max > 0 ? (total / max) * 100 : 0; + const failPct = (d.failed / total) * 100; + const successPct = 100 - failPct; + + return ( +
+ {d.failed > 0 && ( +
+ )} + {d.successful > 0 && ( +
+ )} +
+ ); + })} +
+ ); +}; + +export default Sparkline; diff --git a/internal/portal/src/global.scss b/internal/portal/src/global.scss index 3404206d2..e6f9414f8 100644 --- a/internal/portal/src/global.scss +++ b/internal/portal/src/global.scss @@ -189,6 +189,12 @@ $COLORS_DATAVIZ_SUCCESS_DARK: #004d26 !default; $COLORS_DATAVIZ_ERROR_LIGHT: #cc2314 !default; $COLORS_DATAVIZ_ERROR_DARK: #d64f43 !default; +$COLORS_DATAVIZ_INFO_LIGHT: #99b8f5 !default; +$COLORS_DATAVIZ_INFO_DARK: #5599dd !default; + +$COLORS_DATAVIZ_WARNING_LIGHT: #cc8800 !default; +$COLORS_DATAVIZ_WARNING_DARK: #dd9933 !default; + :root { --base-font-size: #{$BASE_FONT_SIZE}; --base-grid-multiplier: #{$BASE_GRID_MULTIPLIER}; @@ -243,6 +249,8 @@ $COLORS_DATAVIZ_ERROR_DARK: #d64f43 !default; --colors-dataviz-error: #{$COLORS_DATAVIZ_ERROR_LIGHT}; --colors-dataviz-success: #{$COLORS_DATAVIZ_SUCCESS_LIGHT}; + --colors-dataviz-info: #{$COLORS_DATAVIZ_INFO_LIGHT}; + --colors-dataviz-warning: #{$COLORS_DATAVIZ_WARNING_LIGHT}; --colors-foreground-neutral: #{$COLORS_FOREGROUND_NEUTRAL_LIGHT}; --colors-foreground-neutral-2: #{$COLORS_FOREGROUND_NEUTRAL_2_LIGHT}; @@ -283,6 +291,8 @@ $COLORS_DATAVIZ_ERROR_DARK: #d64f43 !default; --colors-dataviz-error: #{$COLORS_DATAVIZ_ERROR_DARK}; --colors-dataviz-success: #{$COLORS_DATAVIZ_SUCCESS_DARK}; + --colors-dataviz-info: #{$COLORS_DATAVIZ_INFO_DARK}; + --colors-dataviz-warning: #{$COLORS_DATAVIZ_WARNING_DARK}; --colors-foreground-neutral: #{$COLORS_FOREGROUND_NEUTRAL_DARK}; --colors-foreground-neutral-2: #{$COLORS_FOREGROUND_NEUTRAL_2_DARK}; diff --git a/internal/portal/src/scenes/Destination/Destination.scss b/internal/portal/src/scenes/Destination/Destination.scss index c43e9d06e..fb85b614e 100644 --- a/internal/portal/src/scenes/Destination/Destination.scss +++ b/internal/portal/src/scenes/Destination/Destination.scss @@ -127,22 +127,70 @@ margin-top: var(--spacing-14); margin-bottom: var(--spacing-14); - h2 { - margin-top: 0; + &__header { + display: flex; + align-items: center; + 
justify-content: space-between; margin-bottom: var(--spacing-4); + + h2 { + margin: 0; + } } - &__metrics { + &__timeframe { display: flex; + gap: var(--spacing-1); + border: 1px solid var(--colors-outline-neutral); + border-radius: var(--radius-m); + padding: 2px; + background: var(--colors-background-neutral-1); + } + + &__tf-btn { + padding: var(--spacing-1) var(--spacing-2); + font-size: 13px; + font-weight: 500; + color: var(--colors-foreground-neutral-3); + + background: transparent; + border: 1px solid transparent; + border-radius: var(--radius-s); + cursor: pointer; + + &:hover { + color: var(--colors-foreground-neutral); + background: var(--colors-background-neutral-2); + } + + &--active { + box-shadow: var(--colors-shadow-button); + color: var(--colors-foreground-neutral); + background: var(--colors-background); + + &:hover { + background: var(--colors-background); + } + } + } + + &__grid { + display: grid; + grid-template-columns: repeat(6, 1fr); gap: var(--spacing-6); } - &__metric { + &__cell { + grid-column: span 3; border: 1px solid var(--colors-outline-neutral); height: 348px; - width: 100%; background: var(--colors-background); border-radius: var(--radius-m); padding: var(--spacing-6); + + &--row2 { + height: 238px; + grid-column: span 2; + } } } diff --git a/internal/portal/src/scenes/Destination/Destination.tsx b/internal/portal/src/scenes/Destination/Destination.tsx index 268864f33..1d3260480 100644 --- a/internal/portal/src/scenes/Destination/Destination.tsx +++ b/internal/portal/src/scenes/Destination/Destination.tsx @@ -13,6 +13,7 @@ import { DestinationTypeReference, } from "../../typings/Destination"; import getLogo from "../../utils/logo"; +import DestinationMetrics from "./DestinationMetrics"; import DestinationSettings from "./DestinationSettings/DestinationSettings"; import { AttemptRoutes } from "./Events/Attempts"; @@ -229,19 +230,7 @@ const Destination = () => {
)} - {/* - TODO: Uncomment when metrics are implemented -
-

Metrics

-
-
-
[TODO]
-
-
-
[TODO]
-
-
-
*/} + } /> diff --git a/internal/portal/src/scenes/Destination/DestinationMetrics.tsx b/internal/portal/src/scenes/Destination/DestinationMetrics.tsx new file mode 100644 index 000000000..22a1f9d22 --- /dev/null +++ b/internal/portal/src/scenes/Destination/DestinationMetrics.tsx @@ -0,0 +1,232 @@ +import { useState } from "react"; +import MetricsChart, { + ChartDataPoint, +} from "../../common/MetricsChart/MetricsChart"; +import MetricsBreakdown from "../../common/MetricsChart/MetricsBreakdown"; +import { Timeframe, useMetrics } from "../../common/MetricsChart/useMetrics"; +import { Destination } from "../../typings/Destination"; + +const TIMEFRAMES: Timeframe[] = ["1h", "24h", "7d", "30d"]; + +function formatLabel(iso: string, timeframe: Timeframe): string { + const d = new Date(iso); + if (timeframe === "1h" || timeframe === "24h") { + return d.toLocaleTimeString([], { + hour: "2-digit", + minute: "2-digit", + hour12: false, + }); + } + return d.toLocaleDateString([], { month: "short", day: "numeric" }); +} + +function toTimeSeriesData( + data: { time_bucket: string; metrics: Record }[] | undefined, + keys: string[], + timeframe: Timeframe, +): ChartDataPoint[] { + if (!data) return []; + return data.map((d) => { + const point: ChartDataPoint = { + label: formatLabel(d.time_bucket, timeframe), + }; + for (const k of keys) { + point[k] = d.metrics[k] ?? 0; + } + return point; + }); +} + +function hasActivity( + data: { metrics: Record }[] | undefined, +): boolean { + if (!data || data.length === 0) return false; + return data.some( + (d) => + (d.metrics.successful_count ?? 0) + (d.metrics.failed_count ?? 
0) > 0, + ); +} + +interface DestinationMetricsProps { + destination: Destination; +} + +const DestinationMetrics: React.FC = ({ + destination, +}) => { + const [timeframe, setTimeframe] = useState("24h"); + + // Row 1 + const event_count = useMetrics({ + measures: ["count"], + destinationId: destination.id, + timeframe, + filters: { attempt_number: "0", manual: "false" }, + }); + + const delivery = useMetrics({ + measures: ["successful_count", "failed_count"], + destinationId: destination.id, + timeframe, + }); + + // Derive from delivery data — if no deliveries happened, all time-series are empty + const has_activity = hasActivity(delivery.data?.data); + + // Row 2 + const error_rate = useMetrics({ + measures: ["error_rate"], + destinationId: destination.id, + timeframe, + }); + + const by_status = useMetrics({ + measures: ["count"], + destinationId: destination.id, + timeframe, + dimensions: ["code"], + }); + + const by_topic = useMetrics({ + measures: ["count", "error_rate"], + destinationId: destination.id, + timeframe, + dimensions: ["topic"], + }); + + return ( +
+
+

Metrics

+
+ {TIMEFRAMES.map((tf) => ( + + ))} +
+
+
+ {/* Row 1 — Volume */} +
+ +
+
+ +
+ + {/* Row 2 — Error Breakdown (3 columns) */} +
+ `${(v * 100).toFixed(0)}%`} + yAllowDecimals={true} + tooltipFormatter={(value) => [ + `${(Number(value) * 100).toFixed(1)}%`, + "Error rate", + ]} + /> +
+ {destination.type === "webhook" && ( +
+ { + const n = Number(code); + if (n >= 200 && n < 300) return "success"; + if (n >= 400) return "error"; + return undefined; + }} + /> +
+ )} +
+ +
+
+
+ ); +}; + +export default DestinationMetrics; diff --git a/internal/portal/src/scenes/DestinationsList/DestinationEventsCell.tsx b/internal/portal/src/scenes/DestinationsList/DestinationEventsCell.tsx new file mode 100644 index 000000000..f00a88c2a --- /dev/null +++ b/internal/portal/src/scenes/DestinationsList/DestinationEventsCell.tsx @@ -0,0 +1,44 @@ +import Sparkline from "../../common/Sparkline/Sparkline"; +import { useMetrics } from "../../common/MetricsChart/useMetrics"; + +interface DestinationEventsCellProps { + destinationId: string; +} + +function formatCount(n: number): string { + if (n >= 1_000_000) return `${(n / 1_000_000).toFixed(1)}M`; + if (n >= 1_000) return `${(n / 1_000).toFixed(1)}k`; + return String(n); +} + +const DestinationEventsCell: React.FC = ({ + destinationId, +}) => { + const { data, isLoading } = useMetrics({ + measures: ["successful_count", "failed_count"], + destinationId, + timeframe: "24h", + granularity: "4h", + filters: { attempt_number: "0", manual: "false" }, + }); + + if (isLoading || !data) { + return ; + } + + const points = data.data.map((d) => ({ + successful: d.metrics.successful_count ?? 0, + failed: d.metrics.failed_count ?? 
0, + })); + + const total = points.reduce((sum, p) => sum + p.successful + p.failed, 0); + + return ( + <> + + {formatCount(total)} + + ); +}; + +export default DestinationEventsCell; diff --git a/internal/portal/src/scenes/DestinationsList/DestinationList.scss b/internal/portal/src/scenes/DestinationsList/DestinationList.scss index 7dba36cb4..d3b1d0840 100644 --- a/internal/portal/src/scenes/DestinationsList/DestinationList.scss +++ b/internal/portal/src/scenes/DestinationsList/DestinationList.scss @@ -43,3 +43,24 @@ display: block; padding: calc(var(--spacing-2) - 2px) var(--spacing-2); } + +.histogram-cell__loading { + background-color: var(--colors-background-neutral-3); + border-radius: var(--radius-s); + height: 16px; + width: 80px; + opacity: 0.6; + animation: histogram-pulse 1.5s ease-in-out 0.5s infinite; + + @keyframes histogram-pulse { + 0% { + opacity: 0.6; + } + 50% { + opacity: 1; + } + 100% { + opacity: 0.6; + } + } +} diff --git a/internal/portal/src/scenes/DestinationsList/DestinationList.tsx b/internal/portal/src/scenes/DestinationsList/DestinationList.tsx index d1240dbee..af565db71 100644 --- a/internal/portal/src/scenes/DestinationsList/DestinationList.tsx +++ b/internal/portal/src/scenes/DestinationsList/DestinationList.tsx @@ -15,6 +15,7 @@ import CONFIGS from "../../config"; import { useDestinationTypes } from "../../destination-types"; import { Destination } from "../../typings/Destination"; import getLogo from "../../utils/logo"; +import DestinationEventsCell from "./DestinationEventsCell"; const DestinationList: React.FC = () => { const { data: destinations } = useSWR("destinations"); @@ -30,9 +31,7 @@ const DestinationList: React.FC = () => { { header: "Target" }, CONFIGS.TOPICS ? 
{ header: "Topics", width: 120 } : null, { header: "Status", width: 120 }, - // TODO: Uncomment when metrics are implemented - // { header: "Success Rate", width: 120 }, - // { header: "Events (24h)", width: 120 }, + { header: "Events 24h", width: 140 }, ].filter((column) => column !== null); const filtered_destinations = @@ -123,9 +122,7 @@ const DestinationList: React.FC = () => { ) : ( ), - // TODO: Uncomment when metrics are implemented - // 99.5% [TODO], - // 100 [TODO], + , ].filter((entry) => entry !== null), link: `/destinations/${destination.id}`, })) || []; diff --git a/internal/util/testutil/event.go b/internal/util/testutil/event.go index c778001f7..c8c962c99 100644 --- a/internal/util/testutil/event.go +++ b/internal/util/testutil/event.go @@ -174,6 +174,12 @@ func (f *mockAttemptFactory) WithStatus(status string) func(*models.Attempt) { } } +func (f *mockAttemptFactory) WithCode(code string) func(*models.Attempt) { + return func(attempt *models.Attempt) { + attempt.Code = code + } +} + func (f *mockAttemptFactory) WithTime(time time.Time) func(*models.Attempt) { return func(attempt *models.Attempt) { attempt.Time = time diff --git a/scripts/metrics/README.md b/scripts/metrics/README.md new file mode 100644 index 000000000..76f980c98 --- /dev/null +++ b/scripts/metrics/README.md @@ -0,0 +1,68 @@ +# Metrics Seed & QA Scripts + +Scripts for seeding metrics data and QA-testing the portal's metrics dashboard. + +## Overview + +The metrics dashboard shows 5 charts per destination (events count, delivery events, error rate, by status code, by topic) plus a sparkline per row in the destinations list. All charts use the attempts endpoint (`/metrics/attempts`). + +These scripts generate realistic event→attempt chains in the local Postgres DB so you can visually verify the dashboard across different scenarios. + +## Scripts + +### `seed_metrics.sh` + +Generates seed data (events + attempts) with configurable error rates, retry chains, and time distribution. 
+ +```bash +./scripts/metrics/seed_metrics.sh # seed with defaults (10k events, 30d) +./scripts/metrics/seed_metrics.sh --dry-run # print SQL only +./scripts/metrics/seed_metrics.sh --clean # delete seed data only +``` + +All seeded rows use `seed_` ID prefix for easy cleanup. Key tunables via env vars: + +| Env var | Default | Description | +|---------|---------|-------------| +| `SEED_EVENTS` | 10000 | Number of events | +| `SEED_DAYS` | 30 | Spread over N days (0 = last hour) | +| `SEED_ERROR_RATE` | 0.35 | Failure rate per attempt | +| `SEED_RETRY_FRAC` | 0.4 | Fraction of events that get retries | +| `SEED_MAX_RETRIES` | 3 | Max retry attempts per event (1-3) | +| `SEED_TOPICS` | order.completed,... | Comma-separated topics | +| `SEED_CODES` | 500,422 | Failure HTTP codes | +| `SEED_DESTINATIONS` | auto-detected | Falls back to `des_test` | + +### `qa_metrics.sh` + +Wraps `seed_metrics.sh` with named scenarios and verification checklists. + +```bash +./scripts/metrics/qa_metrics.sh # list scenarios +./scripts/metrics/qa_metrics.sh healthy # run one scenario +./scripts/metrics/qa_metrics.sh all # walk through all interactively +``` + +Each scenario cleans existing seed data, seeds fresh data, and prints what to verify in the portal. 
+
+## Scenarios
+
+| Scenario | Events | Error rate | What it tests |
+|----------|--------|------------|---------------|
+| `healthy` | 10k | 5% | Baseline — smooth charts, events ≈ deliveries |
+| `failing` | 750 | 85% | Destination down — deliveries >> events |
+| `spike` | 1k | 60% | Volume spike 3 days ago |
+| `empty` | 0 | — | Empty state rendering |
+| `single` | 1 | 0% | Minimal data edge case |
+| `all-fail` | 500 | 100% | All red, 100% error rate |
+| `all-success` | 500 | 0% | All green, 0% error rate |
+| `recent` | 60 | 30% | Last hour only (1m granularity) |
+| `many-topics` | 1k | 30% | 10 topics in breakdown table |
+| `many-codes` | 1k | 50% | 9 HTTP status codes |
+| `retry-heavy` | 500 | 40% | Events vs deliveries gap (2-3x) |
+
+## Requirements
+
+- Outpost running locally via `docker compose`
+- Portal at `http://localhost:3333`
+- A destination must exist (scripts default to `des_test`)
diff --git a/scripts/metrics/qa_metrics.sh b/scripts/metrics/qa_metrics.sh
new file mode 100755
index 000000000..6ec1fe2bd
--- /dev/null
+++ b/scripts/metrics/qa_metrics.sh
@@ -0,0 +1,371 @@
+#!/usr/bin/env bash
+#
+# QA test scenarios for the metrics dashboard.
+#
+# Usage:
+#   ./scripts/metrics/qa_metrics.sh           # list scenarios
+#   ./scripts/metrics/qa_metrics.sh healthy   # run a scenario
+#   ./scripts/metrics/qa_metrics.sh all       # run all scenarios interactively
+#
+# Each scenario cleans existing seed data, seeds fresh data with realistic
+# event→attempt chains (retries share event_id), and prints a checklist.
+#
+# Dashboard layout (2 rows, 5 API calls):
+#   Row 1: Event count (attempt_number=1) | Delivery events (stacked success/failed)
+#   Row 2: Error rate | By status code (webhook only) | By topic
+#
+# Destinations list: 24h sparkline per row (attempt_number=1, 4h granularity)
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+SEED_SCRIPT="$SCRIPT_DIR/seed_metrics.sh"
+
+if [[ !
-x "$SEED_SCRIPT" ]]; then + chmod +x "$SEED_SCRIPT" +fi + +# ── Scenario definitions ──────────────────────────────────────────────────── + +run_healthy() { + echo "━━━ Scenario: HEALTHY DESTINATION ━━━" + echo "Low error rate, few retries, steady traffic. The happy path." + echo "" + + SEED_EVENTS=10000 SEED_DAYS=30 SEED_ERROR_RATE=0.05 SEED_RETRY_FRAC=0.1 \ + SEED_MAX_RETRIES=1 SEED_MANUAL_RATE=0.02 SEED_DENSE_RATIO=0.2 SEED_DENSE_DAY=5 \ + SEED_TOPICS="order.completed,payment.processed,user.signup" \ + "$SEED_SCRIPT" + + cat << 'EOF' + +── Verify ── + Row 1: + [ ] Event count: steady bars across 30d + [ ] Delivery events: mostly green, tiny red slivers + [ ] Event count ≈ delivery events (few retries) + Row 2: + [ ] Error rate: near 5%, spiky in 30d due to sparse buckets (try 7d for smoother line) + [ ] By status code: 200/201 dominate, few 500/422 + [ ] By topic: 3 topics, all with low error rate + Sparkline: + [ ] Destinations list shows mini bar chart per destination +EOF +} + +run_failing() { + echo "━━━ Scenario: FAILING DESTINATION ━━━" + echo "85% error rate, most events retry multiple times. Destination is down." 
+ echo "" + + SEED_EVENTS=750 SEED_DAYS=30 SEED_ERROR_RATE=0.85 SEED_RETRY_FRAC=0.8 \ + SEED_MAX_RETRIES=3 SEED_MANUAL_RATE=0.05 SEED_DENSE_RATIO=0.3 \ + SEED_TOPICS="order.completed,payment.processed" \ + "$SEED_SCRIPT" + + cat << 'EOF' + +── Verify ── + Row 1: + [ ] Event count: moderate bars + [ ] Delivery events: MUCH taller than event count (retries inflate it) + [ ] Delivery events: mostly red, some green + Row 2: + [ ] Error rate: line near 85% + [ ] Error rate Y-axis: shows percentages (0%, 25%, 50%, 75%, 100%), NOT integers + [ ] Error rate line: smooth, NOT zigzag between 0 and 1 + [ ] By status code: 500/422 dominate + [ ] By topic: both topics ~85% error rate + KEY CHECK: + [ ] Delivery events count >> Event count (retries make more attempts) +EOF +} + +run_spike() { + echo "━━━ Scenario: INCIDENT SPIKE ━━━" + echo "Volume spike 3 days ago with 60% error rate and heavy retries." + echo "" + + SEED_EVENTS=1000 SEED_DAYS=30 SEED_DENSE_DAY=3 SEED_DENSE_RATIO=0.5 \ + SEED_ERROR_RATE=0.6 SEED_RETRY_FRAC=0.7 SEED_MAX_RETRIES=3 \ + SEED_MANUAL_RATE=0.15 \ + SEED_TOPICS="order.completed,payment.processed,user.signup,invoice.created" \ + "$SEED_SCRIPT" + + cat << 'EOF' + +── Verify ── + 30d view: + [ ] Event count: visible volume spike 3 days ago + [ ] Delivery events: spike even larger (retries amplify it) + [ ] Error rate: ~60% overall (smoother near spike due to more data, noisy elsewhere) + 7d view: + [ ] Volume spike more prominent + Breakdown: + [ ] By status code: 500/422 significant + [ ] By topic: 4 topics visible, all ~60% error rate +EOF +} + +run_empty() { + echo "━━━ Scenario: EMPTY STATE ━━━" + echo "No data — verify empty state rendering." 
+ echo "" + + SEED_EVENTS=0 "$SEED_SCRIPT" + + cat << 'EOF' + +── Verify ── + [ ] All charts show "No data" empty state (gated by hasActivity) + [ ] No JS errors in browser console + [ ] Timeframe buttons still work (switching doesn't crash) + [ ] Loading spinners appear briefly on switch +EOF +} + +run_single() { + echo "━━━ Scenario: SINGLE EVENT ━━━" + echo "1 event, 1 attempt — minimal data edge case." + echo "" + + SEED_EVENTS=1 SEED_DAYS=1 SEED_ERROR_RATE=0 SEED_RETRY_FRAC=0 \ + SEED_MANUAL_RATE=0 SEED_DENSE_RATIO=0 \ + SEED_TOPICS="order.completed" \ + "$SEED_SCRIPT" + + cat << 'EOF' + +── Verify ── + 24h view: + [ ] Event count: single bar, count=1 + [ ] Delivery events: single green bar, count=1 + [ ] Event count = delivery events (no retries) + [ ] Error rate: 0% + [ ] By status code: single row (200 or 201), count=1 (webhook only) + [ ] By topic: single row (order.completed), 0% error + Edge cases: + [ ] Charts don't look broken with 1 point + [ ] Tooltips work on hover +EOF +} + +run_all_fail() { + echo "━━━ Scenario: 100% ERROR RATE ━━━" + echo "Every attempt fails, many retries — worst case." + echo "" + + SEED_EVENTS=500 SEED_DAYS=7 SEED_ERROR_RATE=1.0 SEED_RETRY_FRAC=0.7 \ + SEED_MAX_RETRIES=3 SEED_MANUAL_RATE=0 \ + SEED_TOPICS="webhook.delivery" \ + "$SEED_SCRIPT" + + cat << 'EOF' + +── Verify ── + 7d view: + [ ] Event count: moderate bars + [ ] Delivery events: ALL red, zero green — much taller than event count + [ ] Error rate: flat line at 100% + [ ] Error rate Y-axis: 100% at top + [ ] By status code: only 500/422, no 200/201 + [ ] By topic: "webhook.delivery" with 100.0% error + Critical: + [ ] successful_count = 0 everywhere + [ ] No division-by-zero errors +EOF +} + +run_all_success() { + echo "━━━ Scenario: 0% ERROR RATE ━━━" + echo "Every attempt succeeds, no retries." 
+ echo "" + + SEED_EVENTS=500 SEED_DAYS=7 SEED_ERROR_RATE=0 SEED_RETRY_FRAC=0 \ + SEED_MANUAL_RATE=0 SEED_DENSE_RATIO=0.3 \ + SEED_TOPICS="order.completed,user.signup" \ + "$SEED_SCRIPT" + + cat << 'EOF' + +── Verify ── + 7d view: + [ ] Event count = delivery events (no retries at all) + [ ] Delivery events: ALL green, zero red + [ ] Error rate: flat line at 0% + [ ] By status code: only 200/201 + [ ] By topic: 2 topics, both 0.0% error + Critical: + [ ] failed_count = 0 everywhere + [ ] Error rate chart renders at 0 (not blank) +EOF +} + +run_recent() { + echo "━━━ Scenario: RECENT DATA (1h view) ━━━" + echo "60 events in the last ~55 minutes — test 1m granularity." + echo "" + + SEED_EVENTS=60 SEED_DAYS=0 \ + SEED_ERROR_RATE=0.3 SEED_RETRY_FRAC=0.4 SEED_MAX_RETRIES=2 \ + SEED_MANUAL_RATE=0.1 \ + SEED_TOPICS="order.completed,user.signup" \ + "$SEED_SCRIPT" + + cat << 'EOF' + +── Verify ── + 1h view: + [ ] Data visible with per-minute granularity + [ ] X-axis shows HH:MM format + [ ] Event count: ~60 bars spread across the hour + [ ] Delivery events: taller due to retries + 24h view: + [ ] Data clustered in the most recent hour + 7d / 30d views: + [ ] All data on today's bar +EOF +} + +run_many_topics() { + echo "━━━ Scenario: MANY TOPICS ━━━" + echo "10 different topics — test breakdown table layout." 
+ echo "" + + SEED_EVENTS=1000 SEED_DAYS=30 SEED_ERROR_RATE=0.3 SEED_RETRY_FRAC=0.3 \ + SEED_MAX_RETRIES=2 SEED_DENSE_RATIO=0.3 \ + SEED_TOPICS="order.completed,order.refunded,order.cancelled,payment.processed,payment.failed,user.signup,user.updated,user.deleted,invoice.created,invoice.paid" \ + "$SEED_SCRIPT" + + cat << 'EOF' + +── Verify ── + 30d view: + [ ] By topic: 10 rows visible + [ ] Sorted descending by count + [ ] Counts roughly equal (~130 each, includes retries) + [ ] Error rates shown per topic + [ ] Table doesn't overflow or break layout + [ ] Bar widths proportional to count +EOF +} + +run_many_codes() { + echo "━━━ Scenario: MANY STATUS CODES ━━━" + echo "Varied HTTP codes — test status code breakdown." + echo "" + + SEED_EVENTS=1000 SEED_DAYS=30 SEED_ERROR_RATE=0.5 SEED_RETRY_FRAC=0.5 \ + SEED_MAX_RETRIES=2 SEED_DENSE_RATIO=0.3 \ + SEED_CODES="500,502,503,422,400,403,429" \ + SEED_SUCCESS_CODES="200,201" \ + SEED_TOPICS="order.completed,payment.processed,user.signup" \ + "$SEED_SCRIPT" + + cat << 'EOF' + +── Verify ── + 30d view: + [ ] By status code: 9 codes visible (200,201,400,403,422,429,500,502,503) + [ ] 2xx bars are green, 4xx/5xx bars are red + [ ] Sorted descending by count + [ ] Bar widths proportional + [ ] Error rate ~50% +EOF +} + +run_retry_heavy() { + echo "━━━ Scenario: RETRY HEAVY ━━━" + echo "Most events retry 2-3 times. Tests event_count vs delivery_events gap." + echo "" + + SEED_EVENTS=500 SEED_DAYS=7 SEED_ERROR_RATE=0.4 SEED_RETRY_FRAC=0.9 \ + SEED_MAX_RETRIES=3 SEED_MANUAL_RATE=0.2 SEED_DENSE_RATIO=0.3 \ + SEED_TOPICS="order.completed,payment.processed" \ + "$SEED_SCRIPT" + + cat << 'EOF' + +── Verify ── + 7d view: + [ ] Event count bars are SHORTER than delivery events bars + [ ] Ratio: delivery_events should be ~2-3x event_count + KEY CHECK: + This scenario exists to verify that the event_count (attempt_number=1) + chart shows fewer items than the delivery events (all attempts) chart. 
+  If they look identical, the retry chains are not working correctly.
+EOF
+}
+
+# ── Main ───────────────────────────────────────────────────────────────────── 
+
+print_usage() {
+  cat << 'EOF'
+Metrics Dashboard QA Scenarios:
+
+  healthy       Low error rate, steady traffic (baseline)
+  failing       85% error rate, destination is down
+  spike         Incident spike 3 days ago
+  empty         No data (empty state)
+  single        Single event, single attempt
+  all-fail      100% error rate, many retries
+  all-success   0% error rate, no retries
+  recent        Last hour only (1m granularity test)
+  many-topics   10 topics (breakdown table test)
+  many-codes    9 HTTP status codes (code breakdown test)
+  retry-heavy   90% events retry 2-3x (event_count vs delivery gap test)
+  all           Run all scenarios interactively
+
+Usage: ./scripts/metrics/qa_metrics.sh <scenario>
+EOF
+}
+
+run_all() {
+  local scenarios=(healthy failing spike empty single all-fail all-success recent many-topics many-codes retry-heavy)
+  for scenario in "${scenarios[@]}"; do
+    echo ""
+    echo "╔══════════════════════════════════════════════════════════════╗"
+    echo "║  Next scenario: $scenario"
+    echo "╚══════════════════════════════════════════════════════════════╝"
+    echo ""
+    read -rp "Press Enter to run '$scenario' (or 'q' to quit): " input
+    [[ "$input" == "q" ]] && break
+
+    case "$scenario" in
+      healthy) run_healthy ;;
+      failing) run_failing ;;
+      spike) run_spike ;;
+      empty) run_empty ;;
+      single) run_single ;;
+      all-fail) run_all_fail ;;
+      all-success) run_all_success ;;
+      recent) run_recent ;;
+      many-topics) run_many_topics ;;
+      many-codes) run_many_codes ;;
+      retry-heavy) run_retry_heavy ;;
+    esac
+
+    echo ""
+    echo "────────────────────────────────────────────────────────────────"
+    echo "  QA the portal now. When ready, press Enter for next scenario."
+    echo "────────────────────────────────────────────────────────────────"
+    read -rp ""
+  done
+  echo "Done!"
+} + +case "${1:-}" in + healthy) run_healthy ;; + failing) run_failing ;; + spike) run_spike ;; + empty) run_empty ;; + single) run_single ;; + all-fail) run_all_fail ;; + all-success) run_all_success ;; + recent) run_recent ;; + many-topics) run_many_topics ;; + many-codes) run_many_codes ;; + retry-heavy) run_retry_heavy ;; + all) run_all ;; + *) print_usage ;; +esac diff --git a/scripts/metrics/seed_metrics.sh b/scripts/metrics/seed_metrics.sh new file mode 100755 index 000000000..ab8bf6369 --- /dev/null +++ b/scripts/metrics/seed_metrics.sh @@ -0,0 +1,317 @@ +#!/usr/bin/env bash +# +# Seed metrics data for local testing. +# +# Generates realistic event→attempt chains: each event gets a first attempt +# (attempt_number=1) and optionally 1-3 retries (attempt_number=2,3,4). +# Earlier attempts in a retry chain fail; the final attempt may succeed or fail. +# +# Usage: +# ./scripts/seed_metrics.sh # defaults +# ./scripts/seed_metrics.sh --dry-run # print SQL only +# ./scripts/seed_metrics.sh --clean # delete seed data only +# SEED_EVENTS=200 ./scripts/seed_metrics.sh +# +# Tunables (env vars): +# SEED_TENANT - tenant ID (default: tenant_1) +# SEED_EVENTS - number of events (default: 150) +# SEED_DAYS - spread over N days back (default: 30, 0 = last hour) +# SEED_DENSE_DAY - day offset for spike, 0=today (default: 4) +# SEED_DENSE_RATIO - fraction of events on spike (default: 0.4) +# SEED_ERROR_RATE - overall failure rate (default: 0.35) +# SEED_RETRY_FRAC - fraction of events that retry (default: 0.4) +# SEED_MAX_RETRIES - max retries per event (1-3) (default: 3) +# SEED_MANUAL_RATE - fraction of retries that are manual (default: 0.1) +# SEED_DESTINATIONS - comma-separated dest IDs (auto-detected from DB) +# SEED_TOPICS - comma-separated topics +# SEED_CODES - failure HTTP codes (default: 500,422) +# SEED_SUCCESS_CODES - success HTTP codes (default: 200,201) +# PG_CONTAINER - docker container name (default: outpost-deps-postgres-1) +# PG_USER - postgres user 
(default: outpost) +# PG_DB - postgres database (default: outpost) +# +# IDs are prefixed with "seed_" for easy cleanup: +# DELETE FROM attempts WHERE id LIKE 'seed_%'; +# DELETE FROM events WHERE id LIKE 'seed_%'; + +set -euo pipefail + +# ── Tunables ────────────────────────────────────────────────────────────────── + +TENANT="${SEED_TENANT:-tenant_1}" +EVENTS="${SEED_EVENTS:-10000}" +DAYS="${SEED_DAYS:-30}" +DENSE_DAY="${SEED_DENSE_DAY:-4}" +DENSE_RATIO="${SEED_DENSE_RATIO:-0.4}" +ERROR_RATE="${SEED_ERROR_RATE:-0.35}" +RETRY_FRAC="${SEED_RETRY_FRAC:-0.4}" +MAX_RETRIES="${SEED_MAX_RETRIES:-3}" +MANUAL_RATE="${SEED_MANUAL_RATE:-0.1}" +PG_CONTAINER="${PG_CONTAINER:-outpost-deps-postgres-1}" +PG_USER="${PG_USER:-outpost}" +PG_DB="${PG_DB:-outpost}" + +IFS=',' read -ra TOPICS <<< "${SEED_TOPICS:-order.completed,payment.processed,user.signup}" +IFS=',' read -ra FAIL_CODES <<< "${SEED_CODES:-500,422}" +IFS=',' read -ra OK_CODES <<< "${SEED_SUCCESS_CODES:-200,201}" + +DRY_RUN=false +CLEAN_ONLY=false +[[ "${1:-}" == "--dry-run" ]] && DRY_RUN=true +[[ "${1:-}" == "--clean" ]] && CLEAN_ONLY=true + +# ── Clean helper ──────────────────────────────────────────────────────────── + +clean_seed_data() { + echo "Cleaning existing seed data..." + docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -q \ + -c "DELETE FROM attempts WHERE id LIKE 'seed_%'; DELETE FROM events WHERE id LIKE 'seed_%';" + echo "Cleaned." 
+} + +if $CLEAN_ONLY; then + clean_seed_data + exit 0 +fi + +if (( EVENTS == 0 )); then + clean_seed_data + exit 0 +fi + +# ── Auto-detect destinations ───────────────────────────────────────────────── + +if [[ -z "${SEED_DESTINATIONS:-}" ]]; then + DEST_RAW=$(docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -t -A \ + -c "SELECT DISTINCT destination_id FROM attempts WHERE tenant_id = '$TENANT' AND destination_id != '' AND id NOT LIKE 'seed_%' LIMIT 5;" 2>/dev/null || true) + if [[ -z "$DEST_RAW" ]]; then + DEST_RAW="des_test" + echo "Warning: No destinations found for $TENANT, using fallback ID: des_test" + fi + IFS=$'\n' read -ra DESTINATIONS <<< "$DEST_RAW" +else + IFS=',' read -ra DESTINATIONS <<< "$SEED_DESTINATIONS" +fi + +NUM_TOPICS=${#TOPICS[@]} +NUM_DESTS=${#DESTINATIONS[@]} +NUM_FAIL_CODES=${#FAIL_CODES[@]} +NUM_OK_CODES=${#OK_CODES[@]} + +# Pre-compute integer thresholds +error_thresh=$(awk "BEGIN {printf \"%d\", $ERROR_RATE * 100}") +retry_thresh=$(awk "BEGIN {printf \"%d\", $RETRY_FRAC * 100}") +manual_thresh=$(awk "BEGIN {printf \"%d\", $MANUAL_RATE * 100}") + +echo "── Seed config ──" +echo " tenant: $TENANT" +echo " events: $EVENTS" +echo " days back: $DAYS" +echo " dense day: $DENSE_DAY days ago" +echo " dense ratio: $DENSE_RATIO" +echo " error rate: $ERROR_RATE" +echo " retry frac: $RETRY_FRAC" +echo " max retries: $MAX_RETRIES" +echo " manual rate: $MANUAL_RATE" +echo " topics: ${TOPICS[*]}" +echo " destinations: ${DESTINATIONS[*]}" +echo " fail codes: ${FAIL_CODES[*]}" +echo " ok codes: ${OK_CODES[*]}" +echo "" + +# ── Generate SQL ────────────────────────────────────────────────────────────── + +generate_sql() { + local now_epoch today_start + now_epoch=$(date +%s) + today_start=$(( now_epoch - (now_epoch % 86400) )) + + local dense_count sparse_count + if (( DAYS == 0 )); then + dense_count=0 + sparse_count=$EVENTS + else + dense_count=$(awk "BEGIN {printf \"%d\", $EVENTS * $DENSE_RATIO}") + sparse_count=$(( EVENTS - dense_count )) + 
fi + + local total_attempts=0 + local total_retries=0 + + cat << 'HEADER' +-- Seed metrics data (auto-generated) +-- Clean up: DELETE FROM attempts WHERE id LIKE 'seed_%'; DELETE FROM events WHERE id LIKE 'seed_%'; + +BEGIN; +HEADER + + local atm_seq=0 + + for (( i = 0; i < EVENTS; i++ )); do + # ── Event time distribution ── + local evt_epoch + if (( DAYS == 0 )); then + local minutes_ago=$(( (EVENTS - i) * 55 / EVENTS )) + evt_epoch=$(( now_epoch - (minutes_ago * 60) + (i % 30) )) + elif (( i < sparse_count )); then + local day_offset=$(( (i * DAYS / sparse_count) )) + [[ "$day_offset" -eq "$DENSE_DAY" ]] && day_offset=$(( day_offset + 1 )) + local hour=$(( (i % 10) + 8 )) + local minute=$(( (i * 7) % 60 )) + evt_epoch=$(( today_start - (day_offset * 86400) + (hour * 3600) + (minute * 60) )) + else + local dense_i=$(( i - sparse_count )) + local bucket + local pct=$(( dense_i * 100 / dense_count )) + if (( pct < 10 )); then bucket=10 + elif (( pct < 30 )); then bucket=11 + elif (( pct < 70 )); then bucket=12 + elif (( pct < 90 )); then bucket=13 + else bucket=14 + fi + local minute=$(( (dense_i * 13) % 60 )) + local second=$(( (dense_i * 7) % 60 )) + evt_epoch=$(( today_start - (DENSE_DAY * 86400) + (bucket * 3600) + (minute * 60) + second )) + fi + + local evt_ts + evt_ts=$(date -r "$evt_epoch" -u '+%Y-%m-%dT%H:%M:%S+00' 2>/dev/null || \ + date -d "@$evt_epoch" -u '+%Y-%m-%dT%H:%M:%S+00' 2>/dev/null) + + # ── Event dimensions ── + local topic="${TOPICS[$(( i % NUM_TOPICS ))]}" + local dest="${DESTINATIONS[$(( i % NUM_DESTS ))]}" + local eligible="true" + if (( i % 3 == 2 )); then eligible="false"; fi + + local evt_id="seed_evt_$(printf '%04d' $i)" + + # Event row + echo "INSERT INTO events (id, tenant_id, destination_id, time, topic, eligible_for_retry, data, metadata)" + echo " VALUES ('$evt_id', '$TENANT', '$dest', '$evt_ts', '$topic', $eligible, '{\"seed\":true,\"index\":$i}', '{\"source\":\"seed\"}');" + + # ── Decide retry chain for this event ── + # Does 
this event get retries? + local retry_hash=$(( (i * 53 + 7) % 100 )) + local num_retries=0 + if (( retry_hash < retry_thresh )); then + # 1 to MAX_RETRIES retries + num_retries=$(( (i % MAX_RETRIES) + 1 )) + fi + + local total_chain=$(( num_retries + 1 )) # first attempt + retries + + # ── Generate attempts for this event ── + # Pattern: first attempt (attempt_number=1), then retries (2,3,...) + # All attempts except the last one FAIL (that's why we retry). + # The last attempt succeeds or fails based on error_rate. + for (( a = 1; a <= total_chain; a++ )); do + local atm_id="seed_atm_$(printf '%05d' $atm_seq)" + atm_seq=$(( atm_seq + 1 )) + + # Attempt time: event_time + (attempt_number * 30-120 seconds) + local delay=$(( (a * 60) + (atm_seq % 60) + 1 )) + local atm_epoch=$(( evt_epoch + delay )) + local atm_ts + atm_ts=$(date -r "$atm_epoch" -u '+%Y-%m-%dT%H:%M:%S+00' 2>/dev/null || \ + date -d "@$atm_epoch" -u '+%Y-%m-%dT%H:%M:%S+00' 2>/dev/null) + + local status code manual + + # Every attempt independently uses SEED_ERROR_RATE + local err_hash=$(( (i * 97 + a * 31 + 13) % 100 )) + if (( err_hash < error_thresh )); then + status="failed" + code="${FAIL_CODES[$(( atm_seq % NUM_FAIL_CODES ))]}" + else + status="success" + code="${OK_CODES[$(( atm_seq % NUM_OK_CODES ))]}" + fi + + # Manual: only retries (attempt_number > 1) can be manual + manual="false" + if (( a > 1 )); then + local manual_hash=$(( (atm_seq * 31 + 17) % 100 )) + if (( manual_hash < manual_thresh )); then + manual="true" + fi + total_retries=$(( total_retries + 1 )) + fi + + total_attempts=$(( total_attempts + 1 )) + + echo "INSERT INTO attempts (id, event_id, destination_id, status, time, code, response_data, manual, attempt_number, tenant_id, topic, event_time, eligible_for_retry, event_data, event_metadata)" + echo " VALUES ('$atm_id', '$evt_id', '$dest', '$status', '$atm_ts', '$code', '{\"seed\":true}', $manual, $a, '$TENANT', '$topic', '$evt_ts', $eligible, '{\"seed\":true,\"index\":$i}', 
'{\"source\":\"seed\"}');" + done + done + + echo "COMMIT;" + + # Output stats to stderr so they don't end up in SQL + echo "STATS:$EVENTS:$total_attempts:$total_retries" >&2 +} + +# ── Clean existing seed data first ──────────────────────────────────────────── + +clean_seed_data + +# ── Generate ───────────────────────────────────────────────────────────────── + +STATS="" +SQL=$(generate_sql 2> >(while read -r line; do + if [[ "$line" == STATS:* ]]; then + STATS="$line" + else + echo "$line" >&2 + fi +done; echo "$STATS" > /tmp/seed_metrics_stats)) + +# Read stats +if [[ -f /tmp/seed_metrics_stats ]]; then + STATS=$(cat /tmp/seed_metrics_stats) + rm -f /tmp/seed_metrics_stats +fi + +IFS=':' read -r _ stat_events stat_attempts stat_retries <<< "$STATS" +stat_events="${stat_events:-$EVENTS}" +stat_attempts="${stat_attempts:-?}" +stat_retries="${stat_retries:-?}" + +if $DRY_RUN; then + echo "$SQL" + echo "" + echo "── Dry run complete. $stat_events events + $stat_attempts attempts ($stat_retries retries) would be inserted." + exit 0 +fi + +# ── Execute ─────────────────────────────────────────────────────────────────── + +echo "Inserting $stat_events events + $stat_attempts attempts ($stat_retries retries)..." 
+echo "$SQL" | docker exec -i "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -q + +# Verify +COUNTS=$(docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -t -A -c \ + "SELECT 'events=' || count(*) FROM events WHERE id LIKE 'seed_%' + UNION ALL + SELECT 'attempts=' || count(*) FROM attempts WHERE id LIKE 'seed_%' + UNION ALL + SELECT 'first_attempts=' || count(*) FROM attempts WHERE id LIKE 'seed_%' AND attempt_number = 1 + UNION ALL + SELECT 'retries=' || count(*) FROM attempts WHERE id LIKE 'seed_%' AND attempt_number > 1;") + +echo "" +echo "── Done ──" +echo "$COUNTS" +echo "" + +# Quick distribution check +DIST=$(docker exec "$PG_CONTAINER" psql -U "$PG_USER" -d "$PG_DB" -t -A -c \ + "SELECT + 'success=' || count(*) FILTER (WHERE status = 'success') || + ' failed=' || count(*) FILTER (WHERE status = 'failed') || + ' error_rate=' || round(count(*) FILTER (WHERE status = 'failed')::numeric / NULLIF(count(*), 0), 3) || + ' avg_attempt=' || round(avg(attempt_number)::numeric, 2) + FROM attempts WHERE id LIKE 'seed_%';") +echo "Distribution: $DIST" +echo "" +echo "To clean up: docker exec $PG_CONTAINER psql -U $PG_USER -d $PG_DB -c \"DELETE FROM attempts WHERE id LIKE 'seed_%'; DELETE FROM events WHERE id LIKE 'seed_%';\""