From 0894a44599bd603c19e94ec1e1a6c459bd81bd26 Mon Sep 17 00:00:00 2001 From: Nate Brown Date: Thu, 7 May 2026 22:59:05 -0500 Subject: [PATCH 1/3] Prime some critical stats before the first scrape --- interface.go | 38 ++++++++++++++---------- interface_test.go | 73 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 15 deletions(-) create mode 100644 interface_test.go diff --git a/interface.go b/interface.go index 5fedcdd3..32f5c2a6 100644 --- a/interface.go +++ b/interface.go @@ -491,26 +491,34 @@ func (f *Interface) emitStats(ctx context.Context, i time.Duration) { certInitiatingVersion := metrics.GetOrRegisterGauge("certificate.initiating_version", nil) certMaxVersion := metrics.GetOrRegisterGauge("certificate.max_version", nil) + emit := func() { + f.firewall.EmitStats() + f.handshakeManager.EmitStats() + udpStats() + + certState := f.pki.getCertState() + defaultCrt := certState.GetDefaultCertificate() + certExpirationGauge.Update(int64(defaultCrt.NotAfter().Sub(time.Now()) / time.Second)) + certInitiatingVersion.Update(int64(defaultCrt.Version())) + + // Report the max certificate version we are capable of using + if certState.v2Cert != nil { + certMaxVersion.Update(int64(certState.v2Cert.Version())) + } else { + certMaxVersion.Update(int64(certState.v1Cert.Version())) + } + } + + // Prime gauges so a Prometheus scrape that lands before the first tick + // sees real values instead of the zero defaults (issue #907). + emit() + for { select { case <-ctx.Done(): return case <-ticker.C: - f.firewall.EmitStats() - f.handshakeManager.EmitStats() - udpStats() - - certState := f.pki.getCertState() - defaultCrt := certState.GetDefaultCertificate() - certExpirationGauge.Update(int64(defaultCrt.NotAfter().Sub(time.Now()) / time.Second)) - certInitiatingVersion.Update(int64(defaultCrt.Version())) - - // Report the max certificate version we are capable of using - if certState.v2Cert != nil { - certMaxVersion.Update(int64(certState.v2Cert.Version())) - } else { - certMaxVersion.Update(int64(certState.v1Cert.Version())) - } + emit() } } } diff --git a/interface_test.go b/interface_test.go new file mode 100644 index 00000000..ba98504e --- /dev/null +++ b/interface_test.go @@ -0,0 +1,73 @@ +package nebula + +import ( + "context" + "net/netip" + "testing" + "time" + + "github.com/rcrowley/go-metrics" + "github.com/slackhq/nebula/cert" + "github.com/slackhq/nebula/firewall" + "github.com/slackhq/nebula/overlay/overlaytest" + "github.com/slackhq/nebula/test" + "github.com/slackhq/nebula/udp" + "github.com/stretchr/testify/assert" +) + +// Test_emitStats_primesGauges verifies issue #907: certificate gauges should +// not read 0 between goroutine launch and the first ticker fire. The ticker +// interval here is set far longer than the test runtime so that any non-zero +// reading must come from the synchronous prime call, not a tick. +func Test_emitStats_primesGauges(t *testing.T) { + defer metrics.DefaultRegistry.UnregisterAll() + + l := test.NewLogger() + hostMap := newHostMap(l) + preferredRanges := []netip.Prefix{netip.MustParsePrefix("10.0.0.0/8")} + hostMap.preferredRanges.Store(&preferredRanges) + + notAfter := time.Now().Add(time.Hour) + cs := &CertState{ + initiatingVersion: cert.Version1, + privateKey: []byte{}, + v1Cert: &dummyCert{version: cert.Version1, notAfter: notAfter}, + v1Credential: nil, + } + + lh := newTestLighthouse() + ifce := &Interface{ + hostMap: hostMap, + inside: &overlaytest.NoopTun{}, + outside: &udp.NoopConn{}, + firewall: &Firewall{Conntrack: &FirewallConntrack{Conns: map[firewall.Packet]*conn{}}}, + lightHouse: lh, + pki: &PKI{}, + handshakeManager: NewHandshakeManager(l, hostMap, lh, &udp.NoopConn{}, defaultHandshakeConfig), + l: l, + } + ifce.pki.cs.Store(cs) + + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { + ifce.emitStats(ctx, time.Hour) // ticker interval that will never fire + close(done) + }() + + // Give the goroutine a beat to run the synchronous prime call. This is + // generous: emit() is microseconds of work in practice. + assert.Eventually(t, func() bool { + return metrics.GetOrRegisterGauge("certificate.ttl_seconds", nil).Value() > 0 + }, time.Second, 10*time.Millisecond, "certificate.ttl_seconds should be primed before first tick") + + ttl := metrics.GetOrRegisterGauge("certificate.ttl_seconds", nil).Value() + assert.Greater(t, ttl, int64(0)) + assert.LessOrEqual(t, ttl, int64(3600)) + + assert.Equal(t, int64(cert.Version1), metrics.GetOrRegisterGauge("certificate.initiating_version", nil).Value()) + assert.Equal(t, int64(cert.Version1), metrics.GetOrRegisterGauge("certificate.max_version", nil).Value()) + + cancel() + <-done +} From 1e36ab8b3a8e418a6e4bec52e0bf7f9a6485b519 Mon Sep 17 00:00:00 2001 From: Nate Brown Date: Thu, 7 May 2026 23:15:36 -0500 Subject: [PATCH 2/3] Lint --- interface_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface_test.go b/interface_test.go index ba98504e..6240a1cc 100644 --- a/interface_test.go +++ b/interface_test.go @@ -62,7 +62,7 @@ func Test_emitStats_primesGauges(t *testing.T) { }, time.Second, 10*time.Millisecond, "certificate.ttl_seconds should be primed before first tick") ttl := metrics.GetOrRegisterGauge("certificate.ttl_seconds", nil).Value() - assert.Greater(t, ttl, int64(0)) + assert.Positive(t, ttl, int64(0)) assert.LessOrEqual(t, ttl, int64(3600)) assert.Equal(t, int64(cert.Version1), metrics.GetOrRegisterGauge("certificate.initiating_version", nil).Value()) From 3a21ecd6c14db0713767813e0752434e0cb27059 Mon Sep 17 00:00:00 2001 From: Nate Brown Date: Thu, 7 May 2026 23:36:21 -0500 Subject: [PATCH 3/3] Fix bugs --- interface_test.go | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/interface_test.go b/interface_test.go index 6240a1cc..b0a9d025 100644 --- a/interface_test.go +++ b/interface_test.go @@ -1,3 +1,5 @@ +//go:build linux || darwin + package nebula import ( @@ -13,12 +15,13 @@ import ( "github.com/slackhq/nebula/test" "github.com/slackhq/nebula/udp" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) -// Test_emitStats_primesGauges verifies issue #907: certificate gauges should -// not read 0 between goroutine launch and the first ticker fire. The ticker -// interval here is set far longer than the test runtime so that any non-zero -// reading must come from the synchronous prime call, not a tick. +// Test_emitStats_primesGauges covers issue #907: a Prometheus scrape that +// landed before the first ticker fire used to read 0 for the cert gauges. +// emitStats now primes the gauges before entering the ticker loop. We assert +// the gauge is zero before the first call and non-zero after. func Test_emitStats_primesGauges(t *testing.T) { defer metrics.DefaultRegistry.UnregisterAll() @@ -45,29 +48,26 @@ func Test_emitStats_primesGauges(t *testing.T) { pki: &PKI{}, handshakeManager: NewHandshakeManager(l, hostMap, lh, &udp.NoopConn{}, defaultHandshakeConfig), l: l, + // On linux, udp.NewUDPStatsEmitter indexes writers[0] and asserts to + // *udp.StdConn. A zero value works: getMemInfo sees a nil rawConn, + // returns an error, and the emitter falls through to a no-op. + writers: []udp.Conn{&udp.StdConn{}}, } ifce.pki.cs.Store(cs) + ttlGauge := metrics.GetOrRegisterGauge("certificate.ttl_seconds", nil) + require.Zero(t, ttlGauge.Value(), "gauge should be zero before emitStats runs") + + // Pre-cancel the context so emitStats returns after priming the gauges + // without ever reading from ticker.C. The one hour interval is just a + // belt-and-suspenders, the test does not expect the ticker to fire. ctx, cancel := context.WithCancel(context.Background()) - done := make(chan struct{}) - go func() { - ifce.emitStats(ctx, time.Hour) // ticker interval that will never fire - close(done) - }() + cancel() + ifce.emitStats(ctx, time.Hour) - // Give the goroutine a beat to run the synchronous prime call. This is - // generous: emit() is microseconds of work in practice. - assert.Eventually(t, func() bool { - return metrics.GetOrRegisterGauge("certificate.ttl_seconds", nil).Value() > 0 - }, time.Second, 10*time.Millisecond, "certificate.ttl_seconds should be primed before first tick") - - ttl := metrics.GetOrRegisterGauge("certificate.ttl_seconds", nil).Value() - assert.Positive(t, ttl, int64(0)) + ttl := ttlGauge.Value() + assert.Positive(t, ttl, "ttl gauge should be primed by emitStats before its first tick") assert.LessOrEqual(t, ttl, int64(3600)) - assert.Equal(t, int64(cert.Version1), metrics.GetOrRegisterGauge("certificate.initiating_version", nil).Value()) assert.Equal(t, int64(cert.Version1), metrics.GetOrRegisterGauge("certificate.max_version", nil).Value()) - - cancel() - <-done }