1
0
mirror of https://github.com/open-telemetry/opentelemetry-go.git synced 2025-08-10 22:31:50 +02:00

Ensure context cancellation during metric pipeline produce does not corrupt data (#6914)

This fixes issue #6344, in which an attempted clean shutdown of the
metrics SDK using PeriodicReader's Shutdown method can produce and emit
wildly incorrect data point values that are orders of magnitude too
large.

The root of the issue is that the pipeline produce method changed in
this PR cannot safely return early, which was happening in the callback
loops that were checking for context cancellation. Early return is not
safe since callbacks and aggregations are tightly coupled in practice:
invoking callbacks without also invoking aggregations corrupts internal
data point value accounting.

The linked issue more concretely walks through the sequence of steps
that were causing this issue.

---------

Co-authored-by: Damien Mathieu <42@dmathieu.com>
This commit is contained in:
Alec Holmes
2025-06-20 00:46:30 -07:00
committed by GitHub
parent 1dc96449c6
commit 2da77b1195
3 changed files with 140 additions and 13 deletions

View File

@@ -121,6 +121,14 @@ func (p *pipeline) addMultiCallback(c multiCallback) (unregister func()) {
//
// This method is safe to call concurrently.
func (p *pipeline) produce(ctx context.Context, rm *metricdata.ResourceMetrics) error {
// Only check if context is already cancelled before starting, not inside or after callback loops.
// If this method returns after executing some callbacks but before running all aggregations,
// internal aggregation state can be corrupted and result in incorrect data returned
// by future produce calls.
if err := ctx.Err(); err != nil {
return err
}
p.Lock()
defer p.Unlock()
@@ -130,12 +138,6 @@ func (p *pipeline) produce(ctx context.Context, rm *metricdata.ResourceMetrics)
if e := c(ctx); e != nil {
err = errors.Join(err, e)
}
if err := ctx.Err(); err != nil {
rm.Resource = nil
clear(rm.ScopeMetrics) // Erase elements to let GC collect objects.
rm.ScopeMetrics = rm.ScopeMetrics[:0]
return err
}
}
for e := p.multiCallbacks.Front(); e != nil; e = e.Next() {
// TODO make the callbacks parallel. ( #3034 )
@@ -143,13 +145,6 @@ func (p *pipeline) produce(ctx context.Context, rm *metricdata.ResourceMetrics)
if e := f(ctx); e != nil {
err = errors.Join(err, e)
}
if err := ctx.Err(); err != nil {
// This means the context expired before we finished running callbacks.
rm.Resource = nil
clear(rm.ScopeMetrics) // Erase elements to let GC collect objects.
rm.ScopeMetrics = rm.ScopeMetrics[:0]
return err
}
}
rm.Resource = p.resource

View File

@@ -613,3 +613,134 @@ func TestPipelineWithMultipleReaders(t *testing.T) {
assert.Equal(t, int64(2), rm.ScopeMetrics[0].Metrics[0].Data.(metricdata.Sum[int64]).DataPoints[0].Value)
}
}
// TestPipelineProduceErrors tests the issue described in https://github.com/open-telemetry/opentelemetry-go/issues/6344.
// Earlier implementations of the pipeline produce method could corrupt metric data point state when the passed context
// was canceled during execution of callbacks. In this case, corruption was the result of some or all callbacks being
// invoked without instrument compAgg functions called.
//
// The subtests below share one pipeline, one context, and the same counters and flags, and they run
// sequentially. Each subtest's expected counts build on the ones before it, so their order matters.
func TestPipelineProduceErrors(t *testing.T) {
// Create a test pipeline with aggregations
pipeReader := NewManualReader()
pipe := newPipeline(nil, pipeReader, nil, exemplar.AlwaysOffFilter)
// Set up an observable with callbacks
var testObsID observableID[int64]
aggBuilder := aggregate.Builder[int64]{Temporality: metricdata.CumulativeTemporality}
// Only the measure function is needed here; the second return value of Sum is discarded.
measure, _ := aggBuilder.Sum(true)
pipe.addInt64Measure(testObsID, []aggregate.Measure[int64]{measure})
// Add an aggregation that just sets the data point value to the number of times the aggregation is invoked
aggCallCount := 0
inst := instrumentSync{
name: "test-metric",
description: "test description",
unit: "test unit",
compAgg: func(dest *metricdata.Aggregation) int {
// Count the invocation and report the running count as the single data point value,
// so the assertions can tell exactly how many times produce ran this aggregation.
aggCallCount++
*dest = metricdata.Sum[int64]{
Temporality: metricdata.CumulativeTemporality,
IsMonotonic: false,
DataPoints: []metricdata.DataPoint[int64]{{Value: int64(aggCallCount)}},
}
return aggCallCount
},
}
pipe.addSync(instrumentation.Scope{Name: "test"}, inst)
// ctx is shared by every subtest; once the second callback cancels it, it stays canceled.
ctx, cancelCtx := context.WithCancel(context.Background())
var shouldCancelContext bool // When true, the second callback cancels ctx
var shouldReturnError bool // When true, the third callback returns an error
// callbackCounts[i] is the number of times callback i+1 has been invoked.
var callbackCounts [3]int
// Callback 1: records the value 123 through every measure registered for testObsID
pipe.callbacks = append(pipe.callbacks, func(ctx context.Context) error {
callbackCounts[0]++
for _, m := range pipe.int64Measures[testObsID] {
m(ctx, 123, *attribute.EmptySet())
}
return nil
})
// Callback 2: cancels the shared context when shouldCancelContext is set, but still returns nil
pipe.callbacks = append(pipe.callbacks, func(ctx context.Context) error {
callbackCounts[1]++
if shouldCancelContext {
cancelCtx()
}
return nil
})
// Callback 3: returns an error when shouldReturnError is set
pipe.callbacks = append(pipe.callbacks, func(ctx context.Context) error {
callbackCounts[2]++
if shouldReturnError {
return fmt.Errorf("test callback error")
}
return nil
})
// assertMetrics checks that rm holds exactly one scope with exactly one metric matching inst,
// whose single data point has the value expectVal. Timestamps are ignored.
assertMetrics := func(rm *metricdata.ResourceMetrics, expectVal int64) {
require.Len(t, rm.ScopeMetrics, 1)
require.Len(t, rm.ScopeMetrics[0].Metrics, 1)
metricdatatest.AssertEqual(t, metricdata.Metrics{
Name: inst.name,
Description: inst.description,
Unit: inst.unit,
Data: metricdata.Sum[int64]{
Temporality: metricdata.CumulativeTemporality,
IsMonotonic: false,
DataPoints: []metricdata.DataPoint[int64]{{Value: expectVal}},
},
}, rm.ScopeMetrics[0].Metrics[0], metricdatatest.IgnoreTimestamp())
}
// Baseline: neither flag is set, so every callback and the aggregation run once.
t.Run("no errors", func(t *testing.T) {
var rm metricdata.ResourceMetrics
err := pipe.produce(ctx, &rm)
require.NoError(t, err)
assert.Equal(t, [3]int{1, 1, 1}, callbackCounts)
assert.Equal(t, 1, aggCallCount)
assertMetrics(&rm, 1)
})
t.Run("callback returns error", func(t *testing.T) {
// This flag is never reset, so the third callback keeps returning an error in later subtests.
shouldReturnError = true
var rm metricdata.ResourceMetrics
err := pipe.produce(ctx, &rm)
require.ErrorContains(t, err, "test callback error")
// Even though a callback returned an error, the agg function is still called
assert.Equal(t, [3]int{2, 2, 2}, callbackCounts)
assert.Equal(t, 2, aggCallCount)
assertMetrics(&rm, 2)
})
t.Run("context canceled during produce", func(t *testing.T) {
shouldCancelContext = true
var rm metricdata.ResourceMetrics
err := pipe.produce(ctx, &rm)
// shouldReturnError is still true from the previous subtest, so the third callback's
// error is expected here as well.
require.ErrorContains(t, err, "test callback error")
// Even though the context was canceled midway through invoking callbacks,
// all remaining callbacks and agg functions are still called
assert.Equal(t, [3]int{3, 3, 3}, callbackCounts)
assert.Equal(t, 3, aggCallCount)
})
t.Run("context already cancelled", func(t *testing.T) {
// ctx was canceled by the second callback during the previous subtest.
var output metricdata.ResourceMetrics
err := pipe.produce(ctx, &output)
require.ErrorIs(t, err, context.Canceled)
// No callbacks or agg functions are called since the context was canceled prior to invoking
// the produce method
assert.Equal(t, [3]int{3, 3, 3}, callbackCounts)
assert.Equal(t, 3, aggCallCount)
})
}