From 7c8e030d55d5a589eaf0c386c502a0108141440c Mon Sep 17 00:00:00 2001 From: Caleb Jasik Date: Wed, 28 Jan 2026 15:10:46 -0600 Subject: [PATCH 1/5] Add e2e/stats_test.go --- e2e/stats_test.go | 130 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 e2e/stats_test.go diff --git a/e2e/stats_test.go b/e2e/stats_test.go new file mode 100644 index 00000000..6a1bd9e2 --- /dev/null +++ b/e2e/stats_test.go @@ -0,0 +1,130 @@ +//go:build e2e_testing +// +build e2e_testing + +package e2e + +import ( + "io" + "net" + "net/http" + "strings" + "testing" + "time" + + "github.com/slackhq/nebula/cert" + "github.com/slackhq/nebula/cert_test" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestPrometheusStats(t *testing.T) { + ca, _, caKey, _ := cert_test.NewTestCaCert(cert.Version1, cert.Curve_CURVE25519, time.Now(), time.Now().Add(10*time.Minute), nil, nil, []string{}) + + // Create a server with Prometheus stats enabled + myControl, _, _, _ := newSimpleServer(cert.Version1, ca, caKey, "me", "10.128.0.1/24", m{ + "stats": m{ + "type": "prometheus", + "listen": "127.0.0.1:9090", + "path": "/metrics", + "interval": "1s", + "namespace": "nebula", + "subsystem": "e2e", + }, + }) + + // Start the server + myControl.Start() + defer myControl.Stop() + + // Fetch metrics from the Prometheus endpoint + resp, err := http.Get("http://127.0.0.1:9090/metrics") + require.NoError(t, err, "Failed to fetch metrics endpoint") + defer resp.Body.Close() + + assert.Equal(t, http.StatusOK, resp.StatusCode, "Metrics endpoint should return 200 OK") + + body, err := io.ReadAll(resp.Body) + require.NoError(t, err, "Failed to read metrics response") + + metricsOutput := string(body) + + // Verify that some expected metrics are present + assert.Contains(t, metricsOutput, "nebula_e2e_info", "Should contain version info metric") + assert.Contains(t, metricsOutput, "nebula_e2e_handshake_manager", "Should contain handshake manager metrics") + assert.Contains(t, metricsOutput, "nebula_e2e_firewall", "Should contain firewall metrics") + +} + +func TestGraphiteStats(t *testing.T) { + ca, _, caKey, _ := cert_test.NewTestCaCert(cert.Version1, cert.Curve_CURVE25519, time.Now(), time.Now().Add(10*time.Minute), nil, nil, []string{}) + + // Create a mock Graphite server + listener, err := net.Listen("tcp", "127.0.0.1:2003") + require.NoError(t, err, "Failed to create mock Graphite listener") + defer listener.Close() + + // Channel to receive stats data + statsChan := make(chan string, 1) + + // Start accepting connections + go func() { + conn, err := listener.Accept() + if err != nil { + t.Logf("Accept error: %v", err) + return + } + defer conn.Close() + + // Set a read timeout + conn.SetReadDeadline(time.Now().Add(5 * time.Second)) + + // Read all data sent by the stats system + data, err := io.ReadAll(conn) + if err != nil { + t.Logf("Read error: %v", err) + return + } + + statsChan <- string(data) + }() + + // Create a server with Graphite stats configured + myControl, _, _, _ := newSimpleServer(cert.Version1, ca, caKey, "me", "10.128.0.1/24", m{ + "stats": m{ + "type": "graphite", + "protocol": "tcp", + "host": "127.0.0.1:2003", + "interval": "1s", + "prefix": "nebula.test", + }, + }) + + // Start the server + myControl.Start() + defer myControl.Stop() + + // Wait for stats to be sent + select { + case statsData := <-statsChan: + // Verify the data is in Graphite plaintext format: "metric.path value timestamp\n" + assert.NotEmpty(t, statsData, "Should receive stats data") + + // Check for expected metrics with the configured prefix + assert.Contains(t, statsData, "nebula.test.", "Should contain configured prefix") + assert.Contains(t, statsData, "runtime.NumGoroutine", "Should contain runtime metrics") + assert.Contains(t, statsData, "runtime.MemStats.Alloc", "Should contain memory stats") + + // Verify format: each line should have metric, value, and timestamp + lines := strings.Split(strings.TrimSpace(statsData), "\n") + assert.Greater(t, len(lines), 0, "Should have at least one metric line") + + // Check first line format + if len(lines) > 0 { + parts := strings.Fields(lines[0]) + assert.Equal(t, 3, len(parts), "Each metric line should have 3 parts: metric value timestamp") + } + + case <-time.After(3 * time.Second): + t.Fatal("Timeout waiting for stats to be sent to Graphite endpoint") + } +} From 9e73ae3b75f842aad4d89536a33564c9a06dc2cc Mon Sep 17 00:00:00 2001 From: Caleb Jasik Date: Wed, 28 Jan 2026 15:15:22 -0600 Subject: [PATCH 2/5] Make the metrics fetch in prometheus test cancellable --- e2e/stats_test.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/e2e/stats_test.go b/e2e/stats_test.go index 6a1bd9e2..75607364 100644 --- a/e2e/stats_test.go +++ b/e2e/stats_test.go @@ -36,8 +36,12 @@ func TestPrometheusStats(t *testing.T) { myControl.Start() defer myControl.Stop() - // Fetch metrics from the Prometheus endpoint - resp, err := http.Get("http://127.0.0.1:9090/metrics") + // Fetch metrics from the Prometheus endpoint with context + ctx := t.Context() + req, err := http.NewRequestWithContext(ctx, "GET", "http://127.0.0.1:9090/metrics", nil) + require.NoError(t, err, "Failed to create request") + + resp, err := http.DefaultClient.Do(req) require.NoError(t, err, "Failed to fetch metrics endpoint") defer resp.Body.Close() From 7c129f72f57e2349b7af82778f7fe3ed63fe6f4e Mon Sep 17 00:00:00 2001 From: Caleb Jasik Date: Wed, 28 Jan 2026 15:22:08 -0600 Subject: [PATCH 3/5] Make the graphite test more cancellable --- e2e/stats_test.go | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/e2e/stats_test.go b/e2e/stats_test.go index 75607364..e061d9e3 100644 --- a/e2e/stats_test.go +++ b/e2e/stats_test.go @@ -62,29 +62,46 @@ func TestPrometheusStats(t *testing.T) { func TestGraphiteStats(t *testing.T) { ca, _, caKey, _ := cert_test.NewTestCaCert(cert.Version1, cert.Curve_CURVE25519, time.Now(), time.Now().Add(10*time.Minute), nil, nil, []string{}) + ctx := t.Context() + // Create a mock Graphite server listener, err := net.Listen("tcp", "127.0.0.1:2003") require.NoError(t, err, "Failed to create mock Graphite listener") - defer listener.Close() + + // Channel to signal goroutine completion + done := make(chan struct{}) // Channel to receive stats data statsChan := make(chan string, 1) + // Close listener when context is cancelled + go func() { + <-ctx.Done() + listener.Close() + }() + // Start accepting connections go func() { + defer close(done) + conn, err := listener.Accept() if err != nil { + if ctx.Err() != nil { + // Context was cancelled, this is expected + return + } t.Logf("Accept error: %v", err) return } defer conn.Close() - // Set a read timeout - conn.SetReadDeadline(time.Now().Add(5 * time.Second)) - // Read all data sent by the stats system data, err := io.ReadAll(conn) if err != nil { + if ctx.Err() != nil { + // Context was cancelled + return + } t.Logf("Read error: %v", err) return } @@ -92,6 +109,12 @@ func TestGraphiteStats(t *testing.T) { statsChan <- string(data) }() + // Ensure goroutine completes before test exits + t.Cleanup(func() { + listener.Close() + <-done + }) + // Create a server with Graphite stats configured myControl, _, _, _ := newSimpleServer(cert.Version1, ca, caKey, "me", "10.128.0.1/24", m{ "stats": m{ From b11125234e9ff13b079a4db2ed412f03257c32fc Mon Sep 17 00:00:00 2001 From: Caleb Jasik Date: Wed, 28 Jan 2026 15:41:28 -0600 Subject: [PATCH 4/5] Simplify graphite test assertions --- e2e/stats_test.go | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/e2e/stats_test.go b/e2e/stats_test.go index e061d9e3..4addcd81 100644 --- a/e2e/stats_test.go +++ b/e2e/stats_test.go @@ -7,7 +7,6 @@ import ( "io" "net" "net/http" - "strings" "testing" "time" @@ -133,24 +132,11 @@ func TestGraphiteStats(t *testing.T) { // Wait for stats to be sent select { case statsData := <-statsChan: - // Verify the data is in Graphite plaintext format: "metric.path value timestamp\n" - assert.NotEmpty(t, statsData, "Should receive stats data") - // Check for expected metrics with the configured prefix assert.Contains(t, statsData, "nebula.test.", "Should contain configured prefix") assert.Contains(t, statsData, "runtime.NumGoroutine", "Should contain runtime metrics") assert.Contains(t, statsData, "runtime.MemStats.Alloc", "Should contain memory stats") - // Verify format: each line should have metric, value, and timestamp - lines := strings.Split(strings.TrimSpace(statsData), "\n") - assert.Greater(t, len(lines), 0, "Should have at least one metric line") - - // Check first line format - if len(lines) > 0 { - parts := strings.Fields(lines[0]) - assert.Equal(t, 3, len(parts), "Each metric line should have 3 parts: metric value timestamp") - } - case <-time.After(3 * time.Second): t.Fatal("Timeout waiting for stats to be sent to Graphite endpoint") } From 2f0cc6ff34bfcc924af3dd526089264b7daf9652 Mon Sep 17 00:00:00 2001 From: Caleb Jasik Date: Wed, 28 Jan 2026 15:51:28 -0600 Subject: [PATCH 5/5] Add retries to the prometheus test --- e2e/stats_test.go | 45 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/e2e/stats_test.go b/e2e/stats_test.go index 4addcd81..dce50e20 100644 --- a/e2e/stats_test.go +++ b/e2e/stats_test.go @@ -35,19 +35,46 @@ func TestPrometheusStats(t *testing.T) { myControl.Start() defer myControl.Stop() - // Fetch metrics from the Prometheus endpoint with context + // Fetch metrics from the Prometheus endpoint with context and retries ctx := t.Context() - req, err := http.NewRequestWithContext(ctx, "GET", "http://127.0.0.1:9090/metrics", nil) - require.NoError(t, err, "Failed to create request") + var resp *http.Response + var body []byte - resp, err := http.DefaultClient.Do(req) - require.NoError(t, err, "Failed to fetch metrics endpoint") - defer resp.Body.Close() + // Retry fetching metrics for up to 3 seconds + timeout := time.After(3 * time.Second) + ticker := time.NewTicker(100 * time.Millisecond) + defer ticker.Stop() - assert.Equal(t, http.StatusOK, resp.StatusCode, "Metrics endpoint should return 200 OK") + for { + select { + case <-ctx.Done(): + t.Fatal("Context cancelled while waiting for metrics endpoint") + case <-timeout: + t.Fatal("Timeout waiting for metrics endpoint to become available") + case <-ticker.C: + req, err := http.NewRequestWithContext(ctx, "GET", "http://127.0.0.1:9090/metrics", nil) + if err != nil { + continue + } - body, err := io.ReadAll(resp.Body) - require.NoError(t, err, "Failed to read metrics response") + resp, err = http.DefaultClient.Do(req) + if err != nil { + continue + } + + if resp.StatusCode == http.StatusOK { + body, err = io.ReadAll(resp.Body) + resp.Body.Close() + if err == nil { + goto success + } + } else { + resp.Body.Close() + } + } + } + +success: metricsOutput := string(body)