Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
181 changes: 181 additions & 0 deletions pp-pkg/rules/alerting_stuck_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
// Copyright 2026 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Tests in this file investigate the "stuck D8ControlPlaneManagerPodNotRunning"
// alert behavior, where an alert keeps firing for hours even after the source
// data clearly indicates it should be inactive.
//
// Each test pins down ONE behavioral assumption underlying our diagnosis.

package rules

import (
"context"
"sync"
"testing"
"time"

"github.com/prometheus/prometheus/model/labels"
"github.com/prometheus/prometheus/promql"
"github.com/prometheus/prometheus/promql/parser"
"github.com/stretchr/testify/require"
)

// scriptedQuery builds a QueryFunc whose n-th invocation yields script[n].
// Each returned sample has its timestamp (T) rewritten to the evaluation
// time so the rule observes a consistent eval instant. The second return
// value reports how many calls have been made so far. Calling past the end
// of the script panics — a test bug, not a runtime condition.
func scriptedQuery(script []promql.Vector) (QueryFunc, func() int) {
	var mu sync.Mutex
	calls := 0
	fn := func(_ context.Context, _ string, ts time.Time) (promql.Vector, error) {
		mu.Lock()
		defer mu.Unlock()
		if calls == len(script) {
			panic("scriptedQuery exhausted")
		}
		src := script[calls]
		calls++
		result := make(promql.Vector, 0, len(src))
		for _, sample := range src {
			// sample is a copy; tagging T here does not mutate the script.
			sample.T = ts.UnixMilli()
			result = append(result, sample)
		}
		return result, nil
	}
	count := func() int {
		mu.Lock()
		defer mu.Unlock()
		return calls
	}
	return fn, count
}

// mustExpr parses src as a PromQL expression, failing the test on any parse error.
func mustExpr(t *testing.T, src string) parser.Expr {
	t.Helper()
	expr, parseErr := parser.ParseExpr(src)
	require.NoError(t, parseErr)
	return expr
}

// TestAlertingRule_OnePassEmpty_TransitionsToInactive verifies that ONE
// evaluation with an empty query result is enough to move a Firing alert into
// the Inactive state. This is the baseline guarantee for "an alert cannot be
// stuck if the query returns nothing".
func TestAlertingRule_OnePassEmpty_TransitionsToInactive(t *testing.T) {
	rule := NewAlertingRule(
		"X", mustExpr(t, `up == 0`),
		time.Minute, // holdDuration
		0,           // keepFiringFor
		labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "",
		true, nil, // restored=true → ALERTS series will be emitted
	)

	firingSample := promql.Vector{{
		Metric: labels.FromStrings("__name__", "up", "job", "x"),
		F:      0,
	}}
	// Three scripted evaluations, one minute apart: pending → firing → resolved.
	query, _ := scriptedQuery([]promql.Vector{
		firingSample, // t=0 : pending starts (new entry in r.active)
		firingSample, // t=1m : transitions to firing (holdDuration met)
		nil,          // t=2m : empty → must transition to Inactive
	})

	base := time.Unix(0, 0).UTC()
	for i := 0; i < 3; i++ {
		_, err := rule.Eval(context.TODO(), 0, base.Add(time.Duration(i)*time.Minute), query, nil, 0)
		require.NoError(t, err)
	}

	require.Len(t, rule.active, 1, "alert entry must still be in r.active (Inactive but kept for resolvedRetention)")
	for _, entry := range rule.active {
		require.Equal(t, StateInactive, entry.State,
			"after a single empty Eval, an active alert MUST be Inactive — got %s", entry.State)
		require.False(t, entry.ResolvedAt.IsZero(), "ResolvedAt must be set on transition to Inactive")
	}
}

// TestAlertingRule_StaysFiringWhileQueryReturnsSameFingerprint pins down the
// other side of the contract: as long as the query returns a sample with the
// same fingerprint, the alert keeps firing indefinitely. This is exactly the
// behaviour we observe in production — so the only way a stuck alert can exist
// is if query() consistently returns a non-empty result.
func TestAlertingRule_StaysFiringWhileQueryReturnsSameFingerprint(t *testing.T) {
	rule := NewAlertingRule(
		"X", mustExpr(t, `up == 0`),
		time.Minute, 0,
		labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "",
		true, nil,
	)

	firingSample := promql.Vector{{
		Metric: labels.FromStrings("__name__", "up", "job", "x"),
		F:      0,
	}}
	// 100 consecutive non-empty evaluations: the alert MUST stay firing the entire time.
	const evals = 100
	script := make([]promql.Vector, 0, evals)
	for len(script) < evals {
		script = append(script, firingSample)
	}
	query, _ := scriptedQuery(script)

	base := time.Unix(0, 0).UTC()
	for i := 0; i < evals; i++ {
		_, err := rule.Eval(context.TODO(), 0, base.Add(time.Duration(i)*time.Minute), query, nil, 0)
		require.NoError(t, err)
	}

	require.Len(t, rule.active, 1)
	for _, entry := range rule.active {
		require.Equal(t, StateFiring, entry.State,
			"alert must stay Firing while query keeps returning the same fingerprint")
	}
}

// TestAlertingRule_DroppedFromActiveAfterResolvedRetention verifies the
// 15-minute resolvedRetention window: once the alert is Inactive AND
// resolvedRetention is exceeded, it must be deleted from r.active.
//
// (This eliminates the hypothesis that some glitch could leave a "stale entry"
// sitting in r.active forever — by design, the entry is GC'd after 15min idle.)
func TestAlertingRule_DroppedFromActiveAfterResolvedRetention(t *testing.T) {
	rule := NewAlertingRule(
		"X", mustExpr(t, `up == 0`),
		time.Minute, 0,
		labels.EmptyLabels(), labels.EmptyLabels(), labels.EmptyLabels(), "",
		true, nil,
	)

	firingSample := promql.Vector{{
		Metric: labels.FromStrings("__name__", "up", "job", "x"),
		F:      0,
	}}
	// Script: pending(0m), firing(1m), then 20 empty evals one minute apart —
	// far beyond the 15-minute resolvedRetention window.
	script := make([]promql.Vector, 0, 22)
	script = append(script, firingSample, firingSample)
	for len(script) < cap(script) {
		script = append(script, nil)
	}
	query, _ := scriptedQuery(script)

	base := time.Unix(0, 0).UTC()
	for i := range script {
		_, err := rule.Eval(context.TODO(), 0, base.Add(time.Duration(i)*time.Minute), query, nil, 0)
		require.NoError(t, err)
	}

	require.Empty(t, rule.active,
		"after >resolvedRetention(15min) of empty evaluations, the alert MUST be removed from r.active")
}
164 changes: 164 additions & 0 deletions pp-pkg/rules/control_plane_expr_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
// Copyright 2026 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Tests in this file run the EXACT alert expression from
// debug/run/rules/control-plane-manager.yaml
// (D8ControlPlaneManagerPodNotRunning)
// against synthetic timeseries shaped like real kube-state-metrics output,
// to lock down what conditions cause the alert to fire vs. stay silent.
//
// We use the upstream PromQL test storage so this file does NOT depend on
// cppbridge or the prompp-specific querier — it isolates the expression
// semantics layer.

package rules

import (
"context"
"testing"
"time"

"github.com/prometheus/prometheus/promql/promqltest"
"github.com/stretchr/testify/require"
)

// cpmExpr is the exact alert expression of D8ControlPlaneManagerPodNotRunning
// from debug/run/rules/control-plane-manager.yaml. Shape: every master node is
// matched against a Ready d8-control-plane-manager DaemonSet pod via the
// kube_controller_pod join; the `unless` cancels masters that have one, so the
// expression returns only the masters that do NOT.
const cpmExpr = `max by (node) (kube_node_role{role="master"} unless kube_node_role{role="master"}` +
	` * on (node) group_left () ((kube_pod_status_ready{condition="true"} == 1) *` +
	` on (pod, namespace) group_right () kube_controller_pod{controller_name="d8-control-plane-manager",controller_type="DaemonSet",namespace="kube-system"}))`

// TestCPM_HappyPath_NoFiring: all 3 masters have a healthy DaemonSet pod, all
// labels match, the `unless` cancels everything → no alert series.
func TestCPM_HappyPath_NoFiring(t *testing.T) {
	st := promqltest.LoadedStorage(t, `
load 1m
kube_node_role{role="master",node="m0",instance="ksm",job="kube-state-metrics"} 1+0x10
kube_node_role{role="master",node="m1",instance="ksm",job="kube-state-metrics"} 1+0x10
kube_node_role{role="master",node="m2",instance="ksm",job="kube-state-metrics"} 1+0x10
kube_pod_status_ready{condition="true",namespace="kube-system",pod="cpm-a",instance="ksm",job="kube-state-metrics"} 1+0x10
kube_pod_status_ready{condition="true",namespace="kube-system",pod="cpm-b",instance="ksm",job="kube-state-metrics"} 1+0x10
kube_pod_status_ready{condition="true",namespace="kube-system",pod="cpm-c",instance="ksm",job="kube-state-metrics"} 1+0x10
kube_controller_pod{controller_name="d8-control-plane-manager",controller_type="DaemonSet",namespace="kube-system",node="m0",pod="cpm-a",job="kube-state-metrics"} 1+0x10
kube_controller_pod{controller_name="d8-control-plane-manager",controller_type="DaemonSet",namespace="kube-system",node="m1",pod="cpm-b",job="kube-state-metrics"} 1+0x10
kube_controller_pod{controller_name="d8-control-plane-manager",controller_type="DaemonSet",namespace="kube-system",node="m2",pod="cpm-c",job="kube-state-metrics"} 1+0x10
`)
	t.Cleanup(func() { _ = st.Close() })

	engine := testEngine(t)
	queryFn := EngineQueryFunc(engine, st)
	// Evaluate mid-way through the loaded range (t=5m), well inside fresh data.
	vec, err := queryFn(context.TODO(), cpmExpr, time.Unix(5*60, 0))
	require.NoError(t, err)
	require.Empty(t, vec, "alert expr must be empty when all masters have healthy pods — got %v", vec)
}

// TestCPM_PodGoneForOneMaster_AlertsForThatMaster: kube_pod_status_ready for
// master-1's pod stops being scraped. Once the stale data ages out of lookback,
// the right-hand side of `unless` loses master-1 → master-1 escapes → expr
// returns one series.
//
// This is the canonical "cause" of the alert firing: a missing pod_status_ready
// readout for one of the masters.
func TestCPM_PodGoneForOneMaster_AlertsForThatMaster(t *testing.T) {
	// We deliberately stop kube_pod_status_ready for cpm-b (master-1) after
	// minute 3 (load 1m → 4 points at t=0..3m). With the default 5m lookback
	// the last point (t=3m) stops being returned shortly after t=8m; we
	// evaluate at t=12m, which is comfortably outside the lookback window.
	storage := promqltest.LoadedStorage(t, `
load 1m
kube_node_role{role="master",node="m0",instance="ksm",job="kube-state-metrics"} 1+0x20
kube_node_role{role="master",node="m1",instance="ksm",job="kube-state-metrics"} 1+0x20
kube_node_role{role="master",node="m2",instance="ksm",job="kube-state-metrics"} 1+0x20
kube_pod_status_ready{condition="true",namespace="kube-system",pod="cpm-a",instance="ksm",job="kube-state-metrics"} 1+0x20
kube_pod_status_ready{condition="true",namespace="kube-system",pod="cpm-b",instance="ksm",job="kube-state-metrics"} 1 1 1 1
kube_pod_status_ready{condition="true",namespace="kube-system",pod="cpm-c",instance="ksm",job="kube-state-metrics"} 1+0x20
kube_controller_pod{controller_name="d8-control-plane-manager",controller_type="DaemonSet",namespace="kube-system",node="m0",pod="cpm-a",job="kube-state-metrics"} 1+0x20
kube_controller_pod{controller_name="d8-control-plane-manager",controller_type="DaemonSet",namespace="kube-system",node="m1",pod="cpm-b",job="kube-state-metrics"} 1+0x20
kube_controller_pod{controller_name="d8-control-plane-manager",controller_type="DaemonSet",namespace="kube-system",node="m2",pod="cpm-c",job="kube-state-metrics"} 1+0x20
`)
	t.Cleanup(func() { _ = storage.Close() })

	ng := testEngine(t)
	q := EngineQueryFunc(ng, storage)
	res, err := q(context.TODO(), cpmExpr, time.Unix(12*60, 0))
	require.NoError(t, err)
	require.Len(t, res, 1, "exactly one master must escape the unless")
	require.Equal(t, "m1", res[0].Metric.Get("node"))
}

// TestCPM_RecoveryClearsAlert: same as the previous test, but the missing
// kube_pod_status_ready readout reappears at minute 14 (the cpm-b series has
// points at t=0..3m, a gap over t=4..13m, then 1s again from t=14m). Once
// fresh samples are back, the expression must again return empty (the alert
// would transition Inactive in AlertingRule.Eval).
//
// This test pins down: "as soon as the data comes back, the expression must
// stop returning the master". If this fails, we have a real engine/data bug.
func TestCPM_RecoveryClearsAlert(t *testing.T) {
	storage := promqltest.LoadedStorage(t, `
load 1m
kube_node_role{role="master",node="m0",instance="ksm",job="kube-state-metrics"} 1+0x20
kube_node_role{role="master",node="m1",instance="ksm",job="kube-state-metrics"} 1+0x20
kube_node_role{role="master",node="m2",instance="ksm",job="kube-state-metrics"} 1+0x20
kube_pod_status_ready{condition="true",namespace="kube-system",pod="cpm-a",instance="ksm",job="kube-state-metrics"} 1+0x20
kube_pod_status_ready{condition="true",namespace="kube-system",pod="cpm-b",instance="ksm",job="kube-state-metrics"} 1 1 1 1 _ _ _ _ _ _ _ _ _ _ 1 1 1 1 1 1 1
kube_pod_status_ready{condition="true",namespace="kube-system",pod="cpm-c",instance="ksm",job="kube-state-metrics"} 1+0x20
kube_controller_pod{controller_name="d8-control-plane-manager",controller_type="DaemonSet",namespace="kube-system",node="m0",pod="cpm-a",job="kube-state-metrics"} 1+0x20
kube_controller_pod{controller_name="d8-control-plane-manager",controller_type="DaemonSet",namespace="kube-system",node="m1",pod="cpm-b",job="kube-state-metrics"} 1+0x20
kube_controller_pod{controller_name="d8-control-plane-manager",controller_type="DaemonSet",namespace="kube-system",node="m2",pod="cpm-c",job="kube-state-metrics"} 1+0x20
`)
	t.Cleanup(func() { _ = storage.Close() })

	ng := testEngine(t)
	q := EngineQueryFunc(ng, storage)

	// While the gap is open (t=10m, cpm-b's last point at t=3m is 7 minutes
	// stale — beyond the default 5m lookback), we expect 1 alerting series.
	res, err := q(context.TODO(), cpmExpr, time.Unix(10*60, 0))
	require.NoError(t, err)
	require.Len(t, res, 1, "expected 1 alerting series during the gap")

	// After recovery (t=16m, with fresh cpm-b samples at t=14..16m well inside
	// the 5m lookback), the expression MUST be empty again.
	res, err = q(context.TODO(), cpmExpr, time.Unix(16*60, 0))
	require.NoError(t, err)
	require.Empty(t, res, "after recovery the expression MUST be empty — got %v", res)
}

// TestCPM_PodReplaced_OldFingerprintMustNotLinger: simulates the production
// scenario where a DaemonSet pod is recreated (UID/pod-name change). The OLD
// pod stops being scraped and the NEW one starts scraping with a different
// `pod` label. The expression has no business reporting either pod after the
// 5m lookback — but if the old `pod` label lingers in storage and the engine
// still sees it via Select(), the join becomes inconsistent and triggers the
// alert.
//
// This pins down: pod replacement WITHOUT a real outage must not fire the alert.
func TestCPM_PodReplaced_OldFingerprintMustNotLinger(t *testing.T) {
	st := promqltest.LoadedStorage(t, `
load 1m
kube_node_role{role="master",node="m0",instance="ksm",job="kube-state-metrics"} 1+0x30
kube_node_role{role="master",node="m1",instance="ksm",job="kube-state-metrics"} 1+0x30
kube_node_role{role="master",node="m2",instance="ksm",job="kube-state-metrics"} 1+0x30
kube_pod_status_ready{condition="true",namespace="kube-system",pod="cpm-a-old",instance="ksm",job="kube-state-metrics"} 1 1 1 1 1
kube_pod_status_ready{condition="true",namespace="kube-system",pod="cpm-b-old",instance="ksm",job="kube-state-metrics"} 1 1 1 1 1
kube_pod_status_ready{condition="true",namespace="kube-system",pod="cpm-c-old",instance="ksm",job="kube-state-metrics"} 1 1 1 1 1
kube_controller_pod{controller_name="d8-control-plane-manager",controller_type="DaemonSet",namespace="kube-system",node="m0",pod="cpm-a-old",job="kube-state-metrics"} 1 1 1 1 1
kube_controller_pod{controller_name="d8-control-plane-manager",controller_type="DaemonSet",namespace="kube-system",node="m1",pod="cpm-b-old",job="kube-state-metrics"} 1 1 1 1 1
kube_controller_pod{controller_name="d8-control-plane-manager",controller_type="DaemonSet",namespace="kube-system",node="m2",pod="cpm-c-old",job="kube-state-metrics"} 1 1 1 1 1
kube_pod_status_ready{condition="true",namespace="kube-system",pod="cpm-a-new",instance="ksm",job="kube-state-metrics"} _ _ _ _ _ 1+0x25
kube_pod_status_ready{condition="true",namespace="kube-system",pod="cpm-b-new",instance="ksm",job="kube-state-metrics"} _ _ _ _ _ 1+0x25
kube_pod_status_ready{condition="true",namespace="kube-system",pod="cpm-c-new",instance="ksm",job="kube-state-metrics"} _ _ _ _ _ 1+0x25
kube_controller_pod{controller_name="d8-control-plane-manager",controller_type="DaemonSet",namespace="kube-system",node="m0",pod="cpm-a-new",job="kube-state-metrics"} _ _ _ _ _ 1+0x25
kube_controller_pod{controller_name="d8-control-plane-manager",controller_type="DaemonSet",namespace="kube-system",node="m1",pod="cpm-b-new",job="kube-state-metrics"} _ _ _ _ _ 1+0x25
kube_controller_pod{controller_name="d8-control-plane-manager",controller_type="DaemonSet",namespace="kube-system",node="m2",pod="cpm-c-new",job="kube-state-metrics"} _ _ _ _ _ 1+0x25
`)
	t.Cleanup(func() { _ = st.Close() })

	engine := testEngine(t)
	queryFn := EngineQueryFunc(engine, st)

	// Well after replacement and well outside lookback for the OLD generation:
	vec, err := queryFn(context.TODO(), cpmExpr, time.Unix(20*60, 0))
	require.NoError(t, err)
	require.Empty(t, vec, "after pod replacement settled, the alert MUST stay silent — got %v", vec)
}
Loading
Loading