From d3f706dc0fa0d5099907409fec163275c62662ae Mon Sep 17 00:00:00 2001 From: Nadia Santalla Date: Thu, 6 Nov 2025 11:19:17 +0100 Subject: [PATCH 1/2] feat: make `probe_duration_seconds` match `probe_script_duration_seconds` For the vast majority of check types, `probe_duration_seconds` is reported as the wall time that took to run the prober. This is accurate for the vast majority of cases, but there's one in particular where it is not: k6-backed checks that use external runners. External runners can take a certain time to process a request, which prior to this commit was counted as part of `probe_duration_seconds` while in fact it was not. This commit adds code that parses the `probe_script_duration_seconds` metric returned by k6 runners, and uses that as the duration of the check for both logs and `probe_duration_seconds`. --- internal/k6runner/http_test.go | 6 ++-- internal/k6runner/k6runner.go | 41 +++++++++++++++++------- internal/k6runner/k6runner_test.go | 4 ++- internal/k6runner/testdata/test.out | 1 + internal/prober/browser/browser.go | 6 ++-- internal/prober/multihttp/multihttp.go | 6 ++-- internal/prober/multihttp/script_test.go | 2 +- internal/prober/scripted/scripted.go | 5 ++- 8 files changed, 43 insertions(+), 28 deletions(-) diff --git a/internal/k6runner/http_test.go b/internal/k6runner/http_test.go index 3cedb2d4d..ed8a77f62 100644 --- a/internal/k6runner/http_test.go +++ b/internal/k6runner/http_test.go @@ -320,7 +320,7 @@ func TestScriptHTTPRun(t *testing.T) { zlogger = testhelper.Logger(t) ) - success, err := script.Run(ctx, registry, logger, zlogger, SecretStore{}) + success, _, err := script.Run(ctx, registry, logger, zlogger, SecretStore{}) require.Equal(t, tc.expectSuccess, success) require.Equal(t, tc.expectLogs, logger.buf.String()) if tc.expectErrorAs == nil { @@ -453,7 +453,7 @@ func TestHTTPProcessorRetries(t *testing.T) { logger testLogger zlogger = zerolog.New(io.Discard) ) - success, err := processor.Run(ctx, registry, &logger, zlogger, SecretStore{}) + success, _, err := processor.Run(ctx, registry, &logger, zlogger, SecretStore{}) require.ErrorIs(t, err, tc.expectError) require.Equal(t, tc.expectError == nil, success) require.Equal(t, tc.expectRequests, requests.Load()) @@ -499,7 +499,7 @@ func TestHTTPProcessorRetries(t *testing.T) { logger testLogger zlogger = zerolog.New(io.Discard) ) - success, err := processor.Run(ctx, registry, &logger, zlogger, SecretStore{}) + success, _, err := processor.Run(ctx, registry, &logger, zlogger, SecretStore{}) require.NoError(t, err) require.True(t, success) }) diff --git a/internal/k6runner/k6runner.go b/internal/k6runner/k6runner.go index 30b0fe046..465e954b5 100644 --- a/internal/k6runner/k6runner.go +++ b/internal/k6runner/k6runner.go @@ -7,6 +7,7 @@ import ( "fmt" "io" "strings" + "time" "github.com/go-logfmt/logfmt" smmmodel "github.com/grafana/synthetic-monitoring-agent/internal/model" @@ -147,7 +148,7 @@ var ( ErrFromRunner = errors.New("runner reported an error") ) -func (r Processor) Run(ctx context.Context, registry *prometheus.Registry, logger logger.Logger, internalLogger zerolog.Logger, secretStore SecretStore) (bool, error) { +func (r Processor) Run(ctx context.Context, registry *prometheus.Registry, logger logger.Logger, internalLogger zerolog.Logger, secretStore SecretStore) (bool, time.Duration, error) { k6runner := r.runner.WithLogger(&internalLogger) // TODO: This error message is okay to be Debug for local k6 execution, but should be Error for remote runners. @@ -156,7 +157,7 @@ func (r Processor) Run(ctx context.Context, registry *prometheus.Registry, logge internalLogger.Debug(). Err(err). Msg("k6 script exited with error code") - return false, err + return false, 0, err } // If only one of Error and ErrorCode are non-empty, the proxy is misbehaving. @@ -164,7 +165,7 @@ func (r Processor) Run(ctx context.Context, registry *prometheus.Registry, logge case result.Error == "" && result.ErrorCode != "": fallthrough case result.Error != "" && result.ErrorCode == "": - return false, fmt.Errorf( + return false, 0, fmt.Errorf( "%w: only one of error (%q) and errorCode (%q) is non-empty", ErrBuggyRunner, result.Error, result.ErrorCode, ) @@ -194,40 +195,41 @@ func (r Processor) Run(ctx context.Context, registry *prometheus.Registry, logge internalLogger.Debug(). Err(err). Msg("cannot load logs to logger") - return false, err + return false, 0, err } var ( - collector sampleCollector - resultCollector checkResultCollector + collector sampleCollector + resultCollector checkResultCollector + durationCollector probeDurationCollector ) - if err := extractMetricSamples(result.Metrics, internalLogger, collector.process, resultCollector.process); err != nil { + if err := extractMetricSamples(result.Metrics, internalLogger, collector.process, resultCollector.process, durationCollector.process); err != nil { internalLogger.Debug(). Err(err). Msg("cannot extract metric samples") - return false, err + return false, 0, err } if err := registry.Register(&collector.collector); err != nil { internalLogger.Error(). Err(err). Msg("cannot register collector") - return false, err + return false, 0, err } // https://github.com/grafana/sm-k6-runner/blob/b811839d444a7e69fd056b0a4e6ccf7e914197f3/internal/mq/runner.go#L51 switch result.ErrorCode { case "": // No error, all good. - return true, nil + return true, durationCollector.duration, nil // TODO: Remove "user" from this list, which has been renamed to "aborted". case "timeout", "killed", "user", "failed", "aborted": // These are user errors. The probe failed, but we don't return an error. - return false, nil + return false, durationCollector.duration, nil default: // We got an "unknown" error, or some other code we do not recognize. Return it so we log it. - return false, fmt.Errorf("%w: %s: %s", ErrFromRunner, result.ErrorCode, result.Error) + return false, durationCollector.duration, fmt.Errorf("%w: %s: %s", ErrFromRunner, result.ErrorCode, result.Error) } } @@ -302,6 +304,21 @@ func (rc *checkResultCollector) process(mf *dto.MetricFamily, sample *model.Samp return nil } +type probeDurationCollector struct { + duration time.Duration +} + +func (dc *probeDurationCollector) process(_ *dto.MetricFamily, sample *model.Sample) error { + if sample.Metric[model.MetricNameLabel] != "probe_script_duration_seconds" { + return nil + } + + // TODO: Is there a better way to convert from float seconds to time? + dc.duration = time.Duration(sample.Value*1000) * time.Millisecond // This truncates to the nearest millisecond. + + return nil +} + func extractMetricSamples(metrics []byte, logger zerolog.Logger, processors ...sampleProcessorFunc) error { promDecoder := expfmt.NewDecoder(bytes.NewBuffer(metrics), expfmt.NewFormat(expfmt.TypeTextPlain)) decoderOpts := expfmt.DecodeOptions{Timestamp: model.Now()} diff --git a/internal/k6runner/k6runner_test.go b/internal/k6runner/k6runner_test.go index b0b361675..8dd69f916 100644 --- a/internal/k6runner/k6runner_test.go +++ b/internal/k6runner/k6runner_test.go @@ -9,6 +9,7 @@ import ( "sort" "strings" "testing" + "time" "github.com/grafana/synthetic-monitoring-agent/internal/model" "github.com/grafana/synthetic-monitoring-agent/internal/prober/logger" @@ -90,9 +91,10 @@ func TestScriptRun(t *testing.T) { // We already know tha parsing the metrics and the logs is working, so // we are only interested in verifying that the script runs without // errors. - success, err := processor.Run(ctx, registry, &logger, zlogger, SecretStore{}) + success, duration, err := processor.Run(ctx, registry, &logger, zlogger, SecretStore{}) require.NoError(t, err) require.True(t, success) + require.Equal(t, 500*time.Millisecond, duration) } func TestCheckInfoFromSM(t *testing.T) { diff --git a/internal/k6runner/testdata/test.out b/internal/k6runner/testdata/test.out index bc2896d1a..f47635393 100644 --- a/internal/k6runner/testdata/test.out +++ b/internal/k6runner/testdata/test.out @@ -1,3 +1,4 @@ probe_data_sent_bytes{scenario="default"} 0 probe_data_received_bytes{scenario="default"} 0 probe_iteration_duration_seconds{scenario="default"} 1.3029e-05 +probe_script_duration_seconds 0.500 diff --git a/internal/prober/browser/browser.go b/internal/prober/browser/browser.go index 35c51dd2f..da8416bad 100644 --- a/internal/prober/browser/browser.go +++ b/internal/prober/browser/browser.go @@ -66,20 +66,18 @@ func (p Prober) Name() string { func (p Prober) Probe(ctx context.Context, target string, registry *prometheus.Registry, logger logger.Logger) (bool, float64) { secretStore, err := p.secretsRetriever(ctx) - if err != nil { p.logger.Error().Err(err).Msg("failed to retrieve secret store") return false, 0 } - success, err := p.processor.Run(ctx, registry, logger, p.logger, secretStore) + success, duration, err := p.processor.Run(ctx, registry, logger, p.logger, secretStore) if err != nil { p.logger.Error().Err(err).Msg("running probe") return false, 0 } - // TODO(mem): implement custom duration extraction. - return success, 0 + return success, duration.Seconds() } // TODO(mem): This should probably be in the k6runner package. diff --git a/internal/prober/multihttp/multihttp.go b/internal/prober/multihttp/multihttp.go index 7563af18d..bf8264a1d 100644 --- a/internal/prober/multihttp/multihttp.go +++ b/internal/prober/multihttp/multihttp.go @@ -89,20 +89,18 @@ func (p Prober) Name() string { func (p Prober) Probe(ctx context.Context, target string, registry *prometheus.Registry, logger logger.Logger) (bool, float64) { secretStore, err := p.secretsRetriever(ctx) - if err != nil { p.logger.Error().Err(err).Msg("running probe") return false, 0 } - success, err := p.processor.Run(ctx, registry, logger, p.logger, secretStore) + success, duration, err := p.processor.Run(ctx, registry, logger, p.logger, secretStore) if err != nil { p.logger.Error().Err(err).Msg("running probe") return false, 0 } - // TODO(mem): implement custom duration extraction. - return success, 0 + return success, duration.Seconds() } // Overrides any user-provided headers with our own augmented values diff --git a/internal/prober/multihttp/script_test.go b/internal/prober/multihttp/script_test.go index d70c9f55e..90cbc0c02 100644 --- a/internal/prober/multihttp/script_test.go +++ b/internal/prober/multihttp/script_test.go @@ -869,7 +869,7 @@ func TestSettingsToScript(t *testing.T) { t.Log("Log entries:\n" + buf.String()) require.True(t, success) - require.Equal(t, float64(0), duration) + require.NotEqual(t, 0, duration) } func TestReplaceVariablesInString(t *testing.T) { diff --git a/internal/prober/scripted/scripted.go b/internal/prober/scripted/scripted.go index 44b7c7793..ddd913a90 100644 --- a/internal/prober/scripted/scripted.go +++ b/internal/prober/scripted/scripted.go @@ -78,14 +78,13 @@ func (p Prober) Probe(ctx context.Context, target string, registry *prometheus.R Bool("hasSecretStoreToken", secretStore.Token != ""). Msg("secret store retrieved for scripted probe") - success, err := p.processor.Run(ctx, registry, logger, p.logger, secretStore) + success, duration, err := p.processor.Run(ctx, registry, logger, p.logger, secretStore) if err != nil { p.logger.Error().Err(err).Msg("running probe") return false, 0 } - // TODO(mem): implement custom duration extraction. - return success, 0 + return success, duration.Seconds() } func newCredentialsRetriever(provider secrets.SecretProvider, tenantID model.GlobalID, logger zerolog.Logger) func(context.Context) (k6runner.SecretStore, error) { From c8e3b2944658332aef540c113c4c5a148c40bef0 Mon Sep 17 00:00:00 2001 From: Nadia Santalla Date: Fri, 7 Nov 2025 15:07:28 +0100 Subject: [PATCH 2/2] chore: use the obviously superior way to `Unseconds()` --- internal/k6runner/k6runner.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/internal/k6runner/k6runner.go b/internal/k6runner/k6runner.go index 465e954b5..90dfa8bb9 100644 --- a/internal/k6runner/k6runner.go +++ b/internal/k6runner/k6runner.go @@ -313,8 +313,7 @@ func (dc *probeDurationCollector) process(_ *dto.MetricFamily, sample *model.Sam return nil } - // TODO: Is there a better way to convert from float seconds to time? - dc.duration = time.Duration(sample.Value*1000) * time.Millisecond // This truncates to the nearest millisecond. + dc.duration = time.Duration(float64(sample.Value) * float64(time.Second)) return nil }