From 7e0af515e2102bea1a792375e3dbc02884cbc3d8 Mon Sep 17 00:00:00 2001 From: Santi Leira Date: Mon, 8 Jul 2024 10:01:04 -0300 Subject: [PATCH] baggage: Fix invalid percent-encoded octet sequences (#5528) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Goal Replace the percent encoded octet sequence with the replacement code point (U+FFFD) when it doesn't match the UTF-8 encoding schema. Issue: https://github.com/open-telemetry/opentelemetry-go/issues/5519 Current behavior: ``` package main import ( "fmt" "log" "unicode/utf8" "go.opentelemetry.io/otel/baggage" ) func main() { kv := "k=aa%ffcc" b, err := baggage.Parse(kv) if err != nil { log.Fatal(err) } val := b.Members()[0].Value() fmt.Println(len(val)) # 5 fmt.Println(utf8.ValidString(val)) # false } ``` Expected behavior: ``` package main import ( "fmt" "log" "unicode/utf8" "go.opentelemetry.io/otel/baggage" ) func main() { kv := "k=aa%ffcc" b, err := baggage.Parse(kv) if err != nil { log.Fatal(err) } val := b.Members()[0].Value() fmt.Println(len(val)) # 7 fmt.Println(utf8.ValidString(val)) # true } ``` ## Benchmark - `go test -bench=BenchmarkParse -count 20 > old.txt` ``` goos: darwin goarch: arm64 pkg: go.opentelemetry.io/otel/baggage BenchmarkParse-10 1548118 774.3 ns/op 864 B/op 8 allocs/op BenchmarkParse-10 1547653 786.0 ns/op 864 B/op 8 allocs/op BenchmarkParse-10 1544949 770.5 ns/op 864 B/op 8 allocs/op BenchmarkParse-10 1558972 770.2 ns/op 864 B/op 8 allocs/op BenchmarkParse-10 1554973 774.7 ns/op 864 B/op 8 allocs/op BenchmarkParse-10 1550200 779.6 ns/op 864 B/op 8 allocs/op BenchmarkParse-10 1545100 774.3 ns/op 864 B/op 8 allocs/op BenchmarkParse-10 1549634 777.5 ns/op 864 B/op 8 allocs/op BenchmarkParse-10 1552530 769.6 ns/op 864 B/op 8 allocs/op BenchmarkParse-10 1536499 855.0 ns/op 864 B/op 8 allocs/op BenchmarkParse-10 1552244 770.4 ns/op 864 B/op 8 allocs/op BenchmarkParse-10 1560225 767.4 ns/op 864 B/op 8 allocs/op BenchmarkParse-10 1562738 772.3 ns/op 864 B/op 8 allocs/op BenchmarkParse-10 1556679 838.9 ns/op 864 B/op 8 allocs/op BenchmarkParse-10 1562500 777.1 ns/op 864 B/op 8 allocs/op BenchmarkParse-10 1530901 836.5 ns/op 864 B/op 8 allocs/op BenchmarkParse-10 1000000 1372 ns/op 864 B/op 8 allocs/op BenchmarkParse-10 1534678 780.3 ns/op 864 B/op 8 allocs/op BenchmarkParse-10 1366180 822.4 ns/op 864 B/op 8 allocs/op BenchmarkParse-10 1539852 796.8 ns/op 864 B/op 8 allocs/op PASS ok go.opentelemetry.io/otel/baggage 40.839s ``` - `go test -bench=BenchmarkParse -count 20 > new.txt` ``` goos: darwin goarch: arm64 pkg: go.opentelemetry.io/otel/baggage BenchmarkParse-10 1355893 886.6 ns/op 888 B/op 9 allocs/op BenchmarkParse-10 1349192 883.1 ns/op 888 B/op 9 allocs/op BenchmarkParse-10 1363053 880.4 ns/op 888 B/op 9 allocs/op BenchmarkParse-10 1372404 875.7 ns/op 888 B/op 9 allocs/op BenchmarkParse-10 1359979 880.7 ns/op 888 B/op 9 allocs/op BenchmarkParse-10 1360497 874.7 ns/op 888 B/op 9 allocs/op BenchmarkParse-10 1375520 870.2 ns/op 888 B/op 9 allocs/op BenchmarkParse-10 1375268 882.8 ns/op 888 B/op 9 allocs/op BenchmarkParse-10 1361998 964.8 ns/op 888 B/op 9 allocs/op BenchmarkParse-10 1373461 961.5 ns/op 888 B/op 9 allocs/op BenchmarkParse-10 1378065 872.6 ns/op 888 B/op 9 allocs/op BenchmarkParse-10 1377290 879.0 ns/op 888 B/op 9 allocs/op BenchmarkParse-10 1362094 885.6 ns/op 888 B/op 9 allocs/op BenchmarkParse-10 1352175 915.9 ns/op 888 B/op 9 allocs/op BenchmarkParse-10 1364914 887.9 ns/op 888 B/op 9 allocs/op BenchmarkParse-10 1355782 890.5 ns/op 888 B/op 9 allocs/op BenchmarkParse-10 1361848 1245 ns/op 888 B/op 9 allocs/op BenchmarkParse-10 1163396 878.8 ns/op 888 B/op 9 allocs/op BenchmarkParse-10 1370886 916.6 ns/op 888 B/op 9 allocs/op BenchmarkParse-10 1340149 1175 ns/op 888 B/op 9 allocs/op PASS ok go.opentelemetry.io/otel/baggage 44.347s ``` - `benchstat old.txt new.txt` ``` goos: darwin goarch: arm64 pkg: go.opentelemetry.io/otel/baggage │ old.txt │ new.txt │ │ sec/op │ sec/op vs base │ Parse-10 777.3n ± 3% 884.4n ± 4% +13.77% (p=0.000 n=20) │ old.txt │ new.txt │ │ B/op │ B/op vs base │ Parse-10 864.0 ± 0% 888.0 ± 0% +2.78% (p=0.000 n=20) │ old.txt │ new.txt │ │ allocs/op │ allocs/op vs base │ Parse-10 8.000 ± 0% 9.000 ± 0% +12.50% (p=0.000 n=20) ``` --------- Co-authored-by: Damien Mathieu <42@dmathieu.com> Co-authored-by: Robert Pająk --- CHANGELOG.md | 1 + baggage/baggage.go | 36 +++++++++++++++++++++--- baggage/baggage_test.go | 62 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 94 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c01e6998e..dcb4dec15 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -49,6 +49,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - Fix stale timestamps reported by the last-value aggregation. (#5517) - Indicate the `Exporter` in `go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp` must be created by the `New` method. (#5521) - Improved performance in all `{Bool,Int64,Float64,String}SliceValue` functions of `go.opentelemetry.io/attributes` by reducing the number of allocations. (#5549) +- Replace invalid percent-encoded octet sequences with replacement char in `go.opentelemetry.io/otel/baggage`. (#5528) ## [1.27.0/0.49.0/0.3.0] 2024-05-21 diff --git a/baggage/baggage.go b/baggage/baggage.go index c40c896cc..42cd46d84 100644 --- a/baggage/baggage.go +++ b/baggage/baggage.go @@ -294,19 +294,45 @@ func parseMember(member string) (Member, error) { return newInvalidMember(), fmt.Errorf("%w: %q", errInvalidKey, key) } - val := strings.TrimSpace(v) - if !validateValue(val) { + rawVal := strings.TrimSpace(v) + if !validateValue(rawVal) { return newInvalidMember(), fmt.Errorf("%w: %q", errInvalidValue, v) } // Decode a percent-encoded value. - value, err := url.PathUnescape(val) + unescapeVal, err := url.PathUnescape(rawVal) if err != nil { return newInvalidMember(), fmt.Errorf("%w: %w", errInvalidValue, err) } + + value := replaceInvalidUTF8Sequences(len(rawVal), unescapeVal) return Member{key: key, value: value, properties: props, hasData: true}, nil } +// replaceInvalidUTF8Sequences replaces invalid UTF-8 sequences with '�'. +func replaceInvalidUTF8Sequences(cap int, unescapeVal string) string { + if utf8.ValidString(unescapeVal) { + return unescapeVal + } + // W3C baggage spec: + // https://github.com/w3c/baggage/blob/8c215efbeebd3fa4b1aceb937a747e56444f22f3/baggage/HTTP_HEADER_FORMAT.md?plain=1#L69 + + var b strings.Builder + b.Grow(cap) + for i := 0; i < len(unescapeVal); { + r, size := utf8.DecodeRuneInString(unescapeVal[i:]) + if r == utf8.RuneError && size == 1 { + // Invalid UTF-8 sequence found, replace it with '�' + _, _ = b.WriteString("�") + } else { + _, _ = b.WriteRune(r) + } + i += size + } + + return b.String() +} + // validate ensures m conforms to the W3C Baggage specification. // A key must be an ASCII string, returning an error otherwise. func (m Member) validate() error { @@ -607,10 +633,12 @@ func parsePropertyInternal(s string) (p Property, ok bool) { } // Decode a percent-encoded value. - value, err := url.PathUnescape(s[valueStart:valueEnd]) + rawVal := s[valueStart:valueEnd] + unescapeVal, err := url.PathUnescape(rawVal) if err != nil { return } + value := replaceInvalidUTF8Sequences(len(rawVal), unescapeVal) ok = true p.key = s[keyStart:keyEnd] diff --git a/baggage/baggage_test.go b/baggage/baggage_test.go index 30150e4f9..146673846 100644 --- a/baggage/baggage_test.go +++ b/baggage/baggage_test.go @@ -9,6 +9,7 @@ import ( "slices" "strings" "testing" + "unicode/utf8" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -469,6 +470,18 @@ func TestBaggageParse(t *testing.T) { in: tooManyMembers, err: errMemberNumber, }, + { + name: "percent-encoded octet sequences do not match the UTF-8 encoding scheme", + in: "k=aa%ffcc;p=d%fff", + want: baggage.List{ + "k": { + Value: "aa�cc", + Properties: []baggage.Property{ + {Key: "p", Value: "d�f", HasValue: true}, + }, + }, + }, + }, } for _, tc := range testcases { @@ -480,6 +493,53 @@ func TestBaggageParse(t *testing.T) { } } +func TestBaggageParseValue(t *testing.T) { + testcases := []struct { + name string + in string + valueWant string + valueWantSize int + }{ + { + name: "percent encoded octet sequence matches UTF-8 encoding scheme", + in: "k=aa%26cc", + valueWant: "aa&cc", + valueWantSize: 5, + }, + { + name: "percent encoded octet sequence doesn't match UTF-8 encoding scheme", + in: "k=aa%ffcc", + valueWant: "aa�cc", + valueWantSize: 7, + }, + { + name: "multiple percent encoded octet sequences don't match UTF-8 encoding scheme", + in: "k=aa%ffcc%fedd%fa", + valueWant: "aa�cc�dd�", + valueWantSize: 15, + }, + { + name: "raw value", + in: "k=aacc", + valueWant: "aacc", + valueWantSize: 4, + }, + } + + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + b, err := Parse(tc.in) + assert.Empty(t, err) + + val := b.Members()[0].Value() + + assert.EqualValues(t, val, tc.valueWant) + assert.Equal(t, len(val), tc.valueWantSize) + assert.True(t, utf8.ValidString(val)) + }) + } +} + func TestBaggageString(t *testing.T) { testcases := []struct { name string @@ -979,7 +1039,7 @@ func BenchmarkParse(b *testing.B) { b.ReportAllocs() for i := 0; i < b.N; i++ { - benchBaggage, _ = Parse(`userId=alice,serverNode = DF28 , isProduction = false,hasProp=stuff;propKey;propWValue=value`) + benchBaggage, _ = Parse("userId=alice,serverNode = DF28 , isProduction = false,hasProp=stuff;propKey;propWValue=value, invalidUtf8=pr%ffo%ffp%fcValue") } }