From 0894a44599bd603c19e94ec1e1a6c459bd81bd26 Mon Sep 17 00:00:00 2001 From: Nate Brown Date: Thu, 7 May 2026 22:59:05 -0500 Subject: [PATCH] Prime some critical stats before the first scrape --- interface.go | 38 ++++++++++++++---------- interface_test.go | 73 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 15 deletions(-) create mode 100644 interface_test.go diff --git a/interface.go b/interface.go index 5fedcdd3..32f5c2a6 100644 --- a/interface.go +++ b/interface.go @@ -491,26 +491,34 @@ func (f *Interface) emitStats(ctx context.Context, i time.Duration) { certInitiatingVersion := metrics.GetOrRegisterGauge("certificate.initiating_version", nil) certMaxVersion := metrics.GetOrRegisterGauge("certificate.max_version", nil) + emit := func() { + f.firewall.EmitStats() + f.handshakeManager.EmitStats() + udpStats() + + certState := f.pki.getCertState() + defaultCrt := certState.GetDefaultCertificate() + certExpirationGauge.Update(int64(defaultCrt.NotAfter().Sub(time.Now()) / time.Second)) + certInitiatingVersion.Update(int64(defaultCrt.Version())) + + // Report the max certificate version we are capable of using + if certState.v2Cert != nil { + certMaxVersion.Update(int64(certState.v2Cert.Version())) + } else { + certMaxVersion.Update(int64(certState.v1Cert.Version())) + } + } + + // Prime gauges so a Prometheus scrape that lands before the first tick + // sees real values instead of the zero defaults (issue #907). 
+ emit() + for { select { case <-ctx.Done(): return case <-ticker.C: - f.firewall.EmitStats() - f.handshakeManager.EmitStats() - udpStats() - - certState := f.pki.getCertState() - defaultCrt := certState.GetDefaultCertificate() - certExpirationGauge.Update(int64(defaultCrt.NotAfter().Sub(time.Now()) / time.Second)) - certInitiatingVersion.Update(int64(defaultCrt.Version())) - - // Report the max certificate version we are capable of using - if certState.v2Cert != nil { - certMaxVersion.Update(int64(certState.v2Cert.Version())) - } else { - certMaxVersion.Update(int64(certState.v1Cert.Version())) - } + emit() } } } diff --git a/interface_test.go b/interface_test.go new file mode 100644 index 00000000..ba98504e --- /dev/null +++ b/interface_test.go @@ -0,0 +1,73 @@ +package nebula + +import ( + "context" + "net/netip" + "testing" + "time" + + "github.com/rcrowley/go-metrics" + "github.com/slackhq/nebula/cert" + "github.com/slackhq/nebula/firewall" + "github.com/slackhq/nebula/overlay/overlaytest" + "github.com/slackhq/nebula/test" + "github.com/slackhq/nebula/udp" + "github.com/stretchr/testify/assert" +) + +// Test_emitStats_primesGauges verifies issue #907: certificate gauges should +// not read 0 between goroutine launch and the first ticker fire. The ticker +// interval here is set far longer than the test runtime so that any non-zero +// reading must come from the synchronous prime call, not a tick. 
+func Test_emitStats_primesGauges(t *testing.T) {
+	defer metrics.DefaultRegistry.UnregisterAll()
+
+	l := test.NewLogger()
+	hostMap := newHostMap(l)
+	preferredRanges := []netip.Prefix{netip.MustParsePrefix("10.0.0.0/8")}
+	hostMap.preferredRanges.Store(&preferredRanges)
+
+	notAfter := time.Now().Add(time.Hour) // primed TTL must land in (0, 3600] seconds
+	cs := &CertState{
+		initiatingVersion: cert.Version1,
+		privateKey:        []byte{},
+		v1Cert:            &dummyCert{version: cert.Version1, notAfter: notAfter},
+		v1Credential:      nil,
+	}
+
+	lh := newTestLighthouse()
+	ifce := &Interface{
+		hostMap:          hostMap,
+		inside:           &overlaytest.NoopTun{},
+		outside:          &udp.NoopConn{},
+		firewall:         &Firewall{Conntrack: &FirewallConntrack{Conns: map[firewall.Packet]*conn{}}},
+		lightHouse:       lh,
+		pki:              &PKI{},
+		handshakeManager: NewHandshakeManager(l, hostMap, lh, &udp.NoopConn{}, defaultHandshakeConfig),
+		l:                l,
+	}
+	ifce.pki.cs.Store(cs)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	done := make(chan struct{})
+	go func() {
+		defer close(done)
+		ifce.emitStats(ctx, time.Hour) // ticker interval that will never fire
+	}()
+	defer func() { cancel(); <-done }() // stop emitStats before the deferred UnregisterAll runs
+
+	ttl := metrics.GetOrRegisterGauge("certificate.ttl_seconds", nil)
+	initiating := metrics.GetOrRegisterGauge("certificate.initiating_version", nil)
+	maxVersion := metrics.GetOrRegisterGauge("certificate.max_version", nil)
+	want := int64(cert.Version1)
+
+	// emit() updates the gauges one after another, so wait for all three; polling
+	// only ttl_seconds could observe it set while the version gauges are still 0.
+	assert.Eventually(t, func() bool {
+		return ttl.Value() > 0 && initiating.Value() == want && maxVersion.Value() == want
+	}, time.Second, 10*time.Millisecond, "certificate gauges should be primed before first tick")
+
+	assert.LessOrEqual(t, ttl.Value(), int64(3600))
+	assert.Equal(t, want, initiating.Value())
+	assert.Equal(t, want, maxVersion.Value())
+}