From c4940f3b4397f8b250d0c76d6f41a10ec1f17611 Mon Sep 17 00:00:00 2001 From: Mikhail Mazurskiy <126021+ash2k@users.noreply.github.com> Date: Tue, 28 Mar 2023 11:05:44 +1100 Subject: [PATCH] TracerProvider allows calling Tracer() while it's shutting down (#3924) --- CHANGELOG.md | 5 +++ sdk/trace/provider.go | 69 ++++++++++++++++++++++++++++---------- sdk/trace/provider_test.go | 41 ++++++++++++++++++---- 3 files changed, 91 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 514dbd447..a7fe4cd32 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,11 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## [Unreleased] +### Fixed + +- `TracerProvider` allows calling `Tracer()` while it's shutting down. + It used to deadlock. (#3924) + ## [1.15.0-rc.2/0.38.0-rc.2] 2023-03-23 This is a release candidate for the v1.15.0/v0.38.0 release. diff --git a/sdk/trace/provider.go b/sdk/trace/provider.go index e152c63b6..324b686f4 100644 --- a/sdk/trace/provider.go +++ b/sdk/trace/provider.go @@ -76,7 +76,8 @@ type TracerProvider struct { mu sync.Mutex namedTracer map[instrumentation.Scope]*tracer spanProcessors atomic.Pointer[spanProcessorStates] - isShutdown bool + + isShutdown atomic.Bool // These fields are not protected by the lock mu. They are assumed to be // immutable after creation of the TracerProvider. @@ -136,10 +137,11 @@ func NewTracerProvider(opts ...TracerProviderOption) *TracerProvider { // // This method is safe to be called concurrently. func (p *TracerProvider) Tracer(name string, opts ...trace.TracerOption) trace.Tracer { + // This check happens before the mutex is acquired to avoid deadlocking if Tracer() is called from within Shutdown(). + if p.isShutdown.Load() { + return trace.NewNoopTracerProvider().Tracer(name, opts...) + } c := trace.NewTracerConfig(opts...) - - p.mu.Lock() - defer p.mu.Unlock() if name == "" { name = defaultTracerName } @@ -148,23 +150,46 @@ func (p *TracerProvider) Tracer(name string, opts ...trace.TracerOption) trace.T Version: c.InstrumentationVersion(), SchemaURL: c.SchemaURL(), } - t, ok := p.namedTracer[is] - if !ok { - t = &tracer{ - provider: p, - instrumentationScope: is, + + t, ok := func() (trace.Tracer, bool) { + p.mu.Lock() + defer p.mu.Unlock() + // Must check the flag after acquiring the mutex to avoid returning a valid tracer if Shutdown() ran + // after the first check above but before we acquired the mutex. + if p.isShutdown.Load() { + return trace.NewNoopTracerProvider().Tracer(name, opts...), true } - p.namedTracer[is] = t - global.Info("Tracer created", "name", name, "version", c.InstrumentationVersion(), "schemaURL", c.SchemaURL()) + t, ok := p.namedTracer[is] + if !ok { + t = &tracer{ + provider: p, + instrumentationScope: is, + } + p.namedTracer[is] = t + } + return t, ok + }() + if !ok { + // This code is outside the mutex to not hold the lock while calling third party logging code: + // - That code may do slow things like I/O, which would prolong the duration the lock is held, + // slowing down all tracing consumers. + // - Logging code may be instrumented with tracing and deadlock because it could try + // acquiring the same non-reentrant mutex. + global.Info("Tracer created", "name", name, "version", is.Version, "schemaURL", is.SchemaURL) } return t } // RegisterSpanProcessor adds the given SpanProcessor to the list of SpanProcessors. func (p *TracerProvider) RegisterSpanProcessor(sp SpanProcessor) { + // This check prevents calls during a shutdown. + if p.isShutdown.Load() { + return + } p.mu.Lock() defer p.mu.Unlock() - if p.isShutdown { + // This check prevents calls after a shutdown. + if p.isShutdown.Load() { return } newSPS := spanProcessorStates{} @@ -175,9 +200,14 @@ func (p *TracerProvider) RegisterSpanProcessor(sp SpanProcessor) { // UnregisterSpanProcessor removes the given SpanProcessor from the list of SpanProcessors. func (p *TracerProvider) UnregisterSpanProcessor(sp SpanProcessor) { + // This check prevents calls during a shutdown. + if p.isShutdown.Load() { + return + } p.mu.Lock() defer p.mu.Unlock() - if p.isShutdown { + // This check prevents calls after a shutdown. + if p.isShutdown.Load() { return } old := *(p.spanProcessors.Load()) @@ -236,13 +266,18 @@ func (p *TracerProvider) ForceFlush(ctx context.Context) error { // Shutdown shuts down TracerProvider. All registered span processors are shut down // in the order they were registered and any held computational resources are released. +// After Shutdown is called, all methods are no-ops. func (p *TracerProvider) Shutdown(ctx context.Context) error { - p.mu.Lock() - defer p.mu.Unlock() - if p.isShutdown { + // This check prevents deadlocks in case of recursive shutdown. + if p.isShutdown.Load() { + return nil + } + p.mu.Lock() + defer p.mu.Unlock() + // This check prevents calls after a shutdown has already been done concurrently. + if !p.isShutdown.CompareAndSwap(false, true) { // did toggle? return nil } - p.isShutdown = true spss := *(p.spanProcessors.Load()) var retErr error diff --git a/sdk/trace/provider_test.go b/sdk/trace/provider_test.go index 327212fc9..282cd16ee 100644 --- a/sdk/trace/provider_test.go +++ b/sdk/trace/provider_test.go @@ -46,11 +46,38 @@ func (t *basicSpanProcessor) ForceFlush(context.Context) error { return nil } +type shutdownSpanProcessor struct { + shutdown func(context.Context) error +} + +func (t *shutdownSpanProcessor) Shutdown(ctx context.Context) error { + return t.shutdown(ctx) +} + +func (t *shutdownSpanProcessor) OnStart(context.Context, ReadWriteSpan) {} +func (t *shutdownSpanProcessor) OnEnd(ReadOnlySpan) {} +func (t *shutdownSpanProcessor) ForceFlush(context.Context) error { + return nil +} + +func TestShutdownCallsTracerMethod(t *testing.T) { + stp := NewTracerProvider() + sp := &shutdownSpanProcessor{ + shutdown: func(ctx context.Context) error { + _ = stp.Tracer("abc") // must not deadlock + return nil + }, + } + stp.RegisterSpanProcessor(sp) + assert.NoError(t, stp.Shutdown(context.Background())) + assert.True(t, stp.isShutdown.Load()) +} + func TestForceFlushAndShutdownTraceProviderWithoutProcessor(t *testing.T) { stp := NewTracerProvider() assert.NoError(t, stp.ForceFlush(context.Background())) assert.NoError(t, stp.Shutdown(context.Background())) - assert.True(t, stp.isShutdown) + assert.True(t, stp.isShutdown.Load()) } func TestShutdownTraceProvider(t *testing.T) { @@ -61,7 +88,7 @@ func TestShutdownTraceProvider(t *testing.T) { assert.NoError(t, stp.ForceFlush(context.Background())) assert.True(t, sp.flushed, "error ForceFlush basicSpanProcessor") assert.NoError(t, stp.Shutdown(context.Background())) - assert.True(t, stp.isShutdown) + assert.True(t, stp.isShutdown.Load()) assert.True(t, sp.closed, "error Shutdown basicSpanProcessor") } @@ -76,7 +103,7 @@ func TestFailedProcessorShutdown(t *testing.T) { err := stp.Shutdown(context.Background()) assert.Error(t, err) assert.Equal(t, err, spErr) - assert.True(t, stp.isShutdown) + assert.True(t, stp.isShutdown.Load()) } func TestFailedProcessorsShutdown(t *testing.T) { @@ -97,7 +124,7 @@ func TestFailedProcessorsShutdown(t *testing.T) { assert.EqualError(t, err, "basic span processor shutdown failure1; basic span processor shutdown failure2") assert.True(t, sp1.closed) assert.True(t, sp2.closed) - assert.True(t, stp.isShutdown) + assert.True(t, stp.isShutdown.Load()) } func TestFailedProcessorShutdownInUnregister(t *testing.T) { @@ -114,7 +141,7 @@ func TestFailedProcessorShutdownInUnregister(t *testing.T) { err := stp.Shutdown(context.Background()) assert.NoError(t, err) - assert.True(t, stp.isShutdown) + assert.True(t, stp.isShutdown.Load()) } func TestSchemaURL(t *testing.T) { @@ -131,7 +158,7 @@ func TestRegisterAfterShutdownWithoutProcessors(t *testing.T) { stp := NewTracerProvider() err := stp.Shutdown(context.Background()) assert.NoError(t, err) - assert.True(t, stp.isShutdown) + assert.True(t, stp.isShutdown.Load()) sp := &basicSpanProcessor{} stp.RegisterSpanProcessor(sp) // no-op @@ -145,7 +172,7 @@ func TestRegisterAfterShutdownWithProcessors(t *testing.T) { stp.RegisterSpanProcessor(sp1) err := stp.Shutdown(context.Background()) assert.NoError(t, err) - assert.True(t, stp.isShutdown) + assert.True(t, stp.isShutdown.Load()) assert.Empty(t, stp.spanProcessors.Load()) sp2 := &basicSpanProcessor{}