Replace fnv with xxhash (#7497)

**Objective**: - Performance comparison between fnv and xxhash in terms of ops/sec, allocations and collisions - Implement xxhash according to first objective **Changes**: - fnv is replaced by xxhash. Performance stats: - **Collision**: No collisions up to 100M - **Allocations**: Same in both cases - **Ops/sec**: xxhash performed better in cases with medium to large strings **Benchmarks**: ``` benchstat old.txt new.txt goos: darwin goarch: arm64 pkg: go.opentelemetry.io/otel/attribute cpu: Apple M2 │ old.txt │ new.txt │ │ sec/op │ sec/op vs base │ NewSet-8 205.5n ± 1% 229.4n ± 1% +11.61% (p=0.002 n=6) NewSetSmallStrings-8 160.5n ± 1% 169.0n ± 5% +5.26% (p=0.002 n=6) NewSetMediumStrings-8 263.8n ± 6% 185.0n ± 1% -29.89% (p=0.002 n=6) NewSetLargeStrings-8 426.4n ± 9% 210.2n ± 1% -50.72% (p=0.002 n=6) NewSetVeryLargeStrings-8 1012.5n ± 7% 238.7n ± 2% -76.43% (p=0.002 n=6) NewSetHugeStrings-8 3622.0n ± 8% 397.1n ± 1% -89.04% (p=0.002 n=6) geomean 488.6n 228.6n -53.21% │ old.txt │ new.txt │ │ B/op │ B/op vs base │ NewSet-8 448.0 ± 0% 448.0 ± 0% ~ (p=1.000 n=6) ¹ NewSetSmallStrings-8 320.0 ± 0% 320.0 ± 0% ~ (p=1.000 n=6) ¹ NewSetMediumStrings-8 320.0 ± 0% 320.0 ± 0% ~ (p=1.000 n=6) ¹ NewSetLargeStrings-8 320.0 ± 0% 320.0 ± 0% ~ (p=1.000 n=6) ¹ NewSetVeryLargeStrings-8 320.0 ± 0% 320.0 ± 0% ~ (p=1.000 n=6) ¹ NewSetHugeStrings-8 320.0 ± 0% 320.0 ± 0% ~ (p=1.000 n=6) ¹ geomean 338.5 338.5 +0.00% ¹ all samples are equal │ old.txt │ new.txt │ │ allocs/op │ allocs/op vs base │ NewSet-8 1.000 ± 0% 1.000 ± 0% ~ (p=1.000 n=6) ¹ NewSetSmallStrings-8 1.000 ± 0% 1.000 ± 0% ~ (p=1.000 n=6) ¹ NewSetMediumStrings-8 1.000 ± 0% 1.000 ± 0% ~ (p=1.000 n=6) ¹ NewSetLargeStrings-8 1.000 ± 0% 1.000 ± 0% ~ (p=1.000 n=6) ¹ NewSetVeryLargeStrings-8 1.000 ± 0% 1.000 ± 0% ~ (p=1.000 n=6) ¹ NewSetHugeStrings-8 1.000 ± 0% 1.000 ± 0% ~ (p=1.000 n=6) ¹ geomean 1.000 1.000 +0.00% ¹ all samples are equal ``` Previous implementation for reference: 
https://github.com/open-telemetry/opentelemetry-go/blame/d0483a7c89d936dcced557fb523465daeac16967/CHANGELOG.md#L16 --------- Co-authored-by: Robert Pająk <pellared@hotmail.com>
2026-06-03 18:35:08 +02:00 · 2025-11-19 15:36:20 +05:30
parent 98eb065c75
commit 49292857b7
55 changed files with 403 additions and 195 deletions
@@ -7,7 +7,7 @@ import (
 	"fmt"
 	"reflect"

-	"go.opentelemetry.io/otel/attribute/internal/fnv"
+	"go.opentelemetry.io/otel/attribute/internal/xxhash"
 )

 // Type identifiers. These identifiers are hashed before the value of the
@@ -17,7 +17,7 @@ import (
 //
 // These are all 8 byte length strings converted to a uint64 representation. A
 // uint64 is used instead of the string directly as an optimization, it avoids
-// the for loop in [fnv] which adds minor overhead.
+// the for loop in [xxhash] which adds minor overhead.
 const (
 	boolID         uint64 = 7953749933313450591 // "_boolean" (little endian)
 	int64ID        uint64 = 7592915492740740150 // "64_bit_i" (little endian)
@@ -29,17 +29,17 @@ const (
 	stringSliceID  uint64 = 7453010373645655387 // "[]string" (little endian)
 )

-// hashKVs returns a new FNV-1a hash of kvs.
-func hashKVs(kvs []KeyValue) fnv.Hash {
-	h := fnv.New()
+// hashKVs returns a new xxHash64 hash of kvs.
+func hashKVs(kvs []KeyValue) uint64 {
+	h := xxhash.New()
 	for _, kv := range kvs {
 		h = hashKV(h, kv)
 	}
-	return h
+	return h.Sum64()
 }

-// hashKV returns the FNV-1a hash of kv with h as the base.
-func hashKV(h fnv.Hash, kv KeyValue) fnv.Hash {
+// hashKV returns the xxHash64 hash of kv with h as the base.
+func hashKV(h xxhash.Hash, kv KeyValue) xxhash.Hash {
 	h = h.String(string(kv.Key))

 	switch kv.Value.Type() {
@@ -11,8 +11,6 @@ import (
 	"slices"
 	"strings"
 	"testing"
-
-	"go.opentelemetry.io/otel/attribute/internal/fnv"
 )

 // keyVals is all the KeyValue generators that are used for testing. This is
@@ -42,7 +40,7 @@ var keyVals = []func(string) KeyValue{

 func TestHashKVsEquality(t *testing.T) {
 	type testcase struct {
-		hash fnv.Hash
+		hash uint64
 		kvs  []KeyValue
 	}

@@ -105,7 +103,7 @@ func TestHashKVsEquality(t *testing.T) {
 type msg struct {
 	cmp      string
 	i, j     int
-	hI, hJ   fnv.Hash
+	hI, hJ   uint64
 	kvI, kvJ []KeyValue
 }

@@ -1,76 +0,0 @@
-// Copyright The OpenTelemetry Authors
-// SPDX-License-Identifier: Apache-2.0
-
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package fnv provides an efficient and allocation free implementation of the
-// FNV-1a, non-cryptographic hash functions created by Glenn Fowler, Landon
-// Curt Noll, and Phong Vo. See
-// https://en.wikipedia.org/wiki/Fowler-Noll-Vo_hash_function.
-//
-// This implementation is provided as an alternative to "hash/fnv". The
-// built-in implementation requires two allocations per Write for a string (one
-// for the hash pointer and the other to convert a string to a []byte). This
-// implementation is more efficientient and does not require any allocations.
-package fnv // import "go.opentelemetry.io/otel/attribute/internal/fnv"
-
-import (
-	"math"
-)
-
-// Taken from "hash/fnv". Verified at:
-//
-//   - https://datatracker.ietf.org/doc/html/draft-eastlake-fnv-17.html
-//   - http://www.isthe.com/chongo/tech/comp/fnv/index.html#FNV-param
-const (
-	offset64 = 14695981039346656037
-	prime64  = 1099511628211
-)
-
-// Hash is an FNV-1a hash with appropriate hashing functions for methods.
-type Hash uint64
-
-// New returns a new initialized 64-bit FNV-1a Hash. Its value is laid out in
-// big-endian byte order.
-func New() Hash {
-	return offset64
-}
-
-func (h Hash) Uint64(val uint64) Hash {
-	v := uint64(h)
-	v = (v ^ ((val >> 56) & 0xFF)) * prime64
-	v = (v ^ ((val >> 48) & 0xFF)) * prime64
-	v = (v ^ ((val >> 40) & 0xFF)) * prime64
-	v = (v ^ ((val >> 32) & 0xFF)) * prime64
-	v = (v ^ ((val >> 24) & 0xFF)) * prime64
-	v = (v ^ ((val >> 16) & 0xFF)) * prime64
-	v = (v ^ ((val >> 8) & 0xFF)) * prime64
-	v = (v ^ ((val >> 0) & 0xFF)) * prime64
-	return Hash(v)
-}
-
-func (h Hash) Bool(val bool) Hash { // nolint:revive  // val is not a flag.
-	if val {
-		return h.Uint64(1)
-	}
-	return h.Uint64(0)
-}
-
-func (h Hash) Float64(val float64) Hash {
-	return h.Uint64(math.Float64bits(val))
-}
-
-func (h Hash) Int64(val int64) Hash {
-	return h.Uint64(uint64(val)) // nolint:gosec // overflow doesn't matter since we are hashing.
-}
-
-func (h Hash) String(val string) Hash {
-	v := uint64(h)
-	for i := 0; i < len(val); i++ {
-		v ^= uint64(val[i])
-		v *= prime64
-	}
-	return Hash(v)
-}
@@ -1,98 +0,0 @@
-// Copyright The OpenTelemetry Authors
-// SPDX-License-Identifier: Apache-2.0
-
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package fnv
-
-import (
-	"encoding/binary"
-	"hash/fnv"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
-)
-
-func TestStringHashCorrectness(t *testing.T) {
-	input := []string{"", "a", "ab", "abc", "世界"}
-
-	refH := fnv.New64a()
-	for _, in := range input {
-		h := New()
-		got := h.String(in)
-
-		refH.Reset()
-		n, err := refH.Write([]byte(in))
-		require.NoError(t, err)
-		require.Equalf(t, len(in), n, "wrote only %d out of %d bytes", n, len(in))
-		want := refH.Sum64()
-
-		assert.Equal(t, want, uint64(got), in)
-	}
-}
-
-func TestUint64HashCorrectness(t *testing.T) {
-	input := []uint64{0, 10, 312984238623, 1024}
-
-	buf := make([]byte, 8)
-	refH := fnv.New64a()
-	for _, in := range input {
-		h := New()
-		got := h.Uint64(in)
-
-		refH.Reset()
-		binary.BigEndian.PutUint64(buf, in)
-		n, err := refH.Write(buf)
-		require.NoError(t, err)
-		require.Equalf(t, 8, n, "wrote only %d out of 8 bytes", n)
-		want := refH.Sum64()
-
-		assert.Equal(t, want, uint64(got), in)
-	}
-}
-
-func TestIntegrity(t *testing.T) {
-	data := []byte{'1', '2', 3, 4, 5, 6, 7, 8, 9, 10}
-	h0 := New()
-	want := h0.String(string(data))
-
-	h1 := New()
-	got := h1.String(string(data[:2]))
-	num := binary.BigEndian.Uint64(data[2:])
-	got = got.Uint64(num)
-
-	assert.Equal(t, want, got)
-}
-
-var result Hash
-
-func BenchmarkStringKB(b *testing.B) {
-	b.SetBytes(1024)
-	data := make([]byte, 1024)
-	for i := range data {
-		data[i] = byte(i)
-	}
-	s := string(data)
-	h := New()
-
-	b.ReportAllocs()
-	b.ResetTimer()
-	for range b.N {
-		result = h.String(s)
-	}
-}
-
-func BenchmarkUint64KB(b *testing.B) {
-	b.SetBytes(8)
-	i := uint64(192386739218721)
-	h := New()
-
-	b.ReportAllocs()
-	b.ResetTimer()
-	for range b.N {
-		result = h.Uint64(i)
-	}
-}
@@ -0,0 +1,64 @@
+// Copyright The OpenTelemetry Authors
+// SPDX-License-Identifier: Apache-2.0
+
+// Package xxhash provides a wrapper around the xxhash library for attribute hashing.
+package xxhash // import "go.opentelemetry.io/otel/attribute/internal/xxhash"
+
+import (
+	"encoding/binary"
+	"math"
+
+	"github.com/cespare/xxhash/v2"
+)
+
+// Hash wraps xxhash.Digest to provide an API friendly for hashing attribute values.
+type Hash struct {
+	d *xxhash.Digest
+}
+
+// New returns a new initialized xxHash64 hasher.
+func New() Hash {
+	return Hash{d: xxhash.New()}
+}
+
+func (h Hash) Uint64(val uint64) Hash {
+	var buf [8]byte
+	binary.LittleEndian.PutUint64(buf[:], val)
+	// errors from Write are always nil for xxhash
+	// if it returns an err then panic
+	_, err := h.d.Write(buf[:])
+	if err != nil {
+		panic("xxhash write of uint64 failed: " + err.Error())
+	}
+	return h
+}
+
+func (h Hash) Bool(val bool) Hash { // nolint:revive // This is a hashing function.
+	if val {
+		return h.Uint64(1)
+	}
+	return h.Uint64(0)
+}
+
+func (h Hash) Float64(val float64) Hash {
+	return h.Uint64(math.Float64bits(val))
+}
+
+func (h Hash) Int64(val int64) Hash {
+	return h.Uint64(uint64(val)) // nolint:gosec // Overflow doesn't matter since we are hashing.
+}
+
+func (h Hash) String(val string) Hash {
+	// errors from WriteString are always nil for xxhash
+	// if it returns an err then panic
+	_, err := h.d.WriteString(val)
+	if err != nil {
+		panic("xxhash write of string failed: " + err.Error())
+	}
+	return h
+}
+
+// Sum64 returns the current hash value.
+func (h Hash) Sum64() uint64 {
+	return h.d.Sum64()
+}
@@ -0,0 +1,197 @@
+// Copyright The OpenTelemetry Authors
+// SPDX-License-Identifier: Apache-2.0
+
+package xxhash
+
+import (
+	"encoding/binary"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestIntegrity(t *testing.T) {
+	data := []byte{'1', '2', 3, 4, 5, 6, 7, 8, 9, 10}
+	h0 := New()
+	want := h0.String(string(data))
+
+	h1 := New()
+	got := h1.String(string(data[:2]))
+	num := binary.LittleEndian.Uint64(data[2:])
+	got = got.Uint64(num)
+
+	assert.Equal(t, want.Sum64(), got.Sum64())
+}
+
+func TestNew(t *testing.T) {
+	h1 := New()
+	h2 := New()
+
+	// Test that the underlying digest is properly initialized.
+	if h1.Sum64() != h2.Sum64() {
+		t.Errorf("New() should return consistent initial value: %d != %d", h1.Sum64(), h2.Sum64())
+	}
+}
+
+func TestUint64(t *testing.T) {
+	h1 := New().Uint64(42)
+	h2 := New().Uint64(42)
+	if h1.Sum64() != h2.Sum64() {
+		t.Errorf("Uint64() should be deterministic: %d != %d", h1.Sum64(), h2.Sum64())
+	}
+
+	h3 := New().Uint64(43)
+	if h1.Sum64() == h3.Sum64() {
+		t.Errorf("Different inputs should produce different hashes: %d == %d", h1.Sum64(), h3.Sum64())
+	}
+}
+
+func TestBool(t *testing.T) {
+	h1 := New().Bool(true)
+	h2 := New().Bool(true)
+	if h1.Sum64() != h2.Sum64() {
+		t.Errorf("Bool() should be deterministic: %d != %d", h1.Sum64(), h2.Sum64())
+	}
+
+	h3 := New().Bool(false)
+	if h1.Sum64() == h3.Sum64() {
+		t.Errorf("Different bool values should produce different hashes: %d == %d", h1.Sum64(), h3.Sum64())
+	}
+}
+
+func TestFloat64(t *testing.T) {
+	h1 := New().Float64(3.14)
+	h2 := New().Float64(3.14)
+	if h1.Sum64() != h2.Sum64() {
+		t.Errorf("Float64() should be deterministic: %d != %d", h1.Sum64(), h2.Sum64())
+	}
+
+	h3 := New().Float64(2.71)
+	if h1.Sum64() == h3.Sum64() {
+		t.Errorf("Different float values should produce different hashes: %d == %d", h1.Sum64(), h3.Sum64())
+	}
+}
+
+func TestInt64(t *testing.T) {
+	h1 := New().Int64(42)
+	h2 := New().Int64(42)
+	if h1.Sum64() != h2.Sum64() {
+		t.Errorf("Int64() should be deterministic: %d != %d", h1.Sum64(), h2.Sum64())
+	}
+
+	h3 := New().Int64(43)
+	if h1.Sum64() == h3.Sum64() {
+		t.Errorf("Different int64 values should produce different hashes: %d == %d", h1.Sum64(), h3.Sum64())
+	}
+}
+
+func TestString(t *testing.T) {
+	h1 := New().String("hello")
+	h2 := New().String("hello")
+	if h1.Sum64() != h2.Sum64() {
+		t.Errorf("String() should be deterministic: %d != %d", h1.Sum64(), h2.Sum64())
+	}
+
+	h3 := New().String("world")
+	if h1.Sum64() == h3.Sum64() {
+		t.Errorf("Different strings should produce different hashes: %d == %d", h1.Sum64(), h3.Sum64())
+	}
+}
+
+func TestChaining(t *testing.T) {
+	// Test that methods can be chained and produce different results
+	h1 := New().String("key").Uint64(42).Bool(true)
+	h2 := New().String("key").Uint64(42).Bool(true)
+	h3 := New().String("key").Uint64(43).Bool(true)
+
+	if h1.Sum64() != h2.Sum64() {
+		t.Errorf("Chained operations should be deterministic: %d != %d", h1.Sum64(), h2.Sum64())
+	}
+
+	if h1.Sum64() == h3.Sum64() {
+		t.Errorf("Different chained operations should produce different hashes: %d == %d", h1.Sum64(), h3.Sum64())
+	}
+}
+
+func BenchmarkStringKB(b *testing.B) {
+	b.SetBytes(1024)
+	data := make([]byte, 1024)
+	for i := range data {
+		data[i] = byte(i)
+	}
+	s := string(data)
+	h := New()
+
+	b.ReportAllocs()
+	b.ResetTimer()
+	for b.Loop() {
+		h.String(s)
+	}
+}
+
+func BenchmarkUint64KB(b *testing.B) {
+	b.SetBytes(8)
+	i := uint64(192386739218721)
+	h := New()
+
+	b.ReportAllocs()
+	b.ResetTimer()
+	for b.Loop() {
+		h.Uint64(i)
+	}
+}
+
+func BenchmarkUint64(b *testing.B) {
+	h := New()
+	b.ResetTimer()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		h = h.Uint64(uint64(i))
+	}
+}
+
+func BenchmarkString(b *testing.B) {
+	h := New()
+	str := "benchmark_string_value"
+	b.ResetTimer()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		h = h.String(str)
+	}
+}
+
+func BenchmarkBool(b *testing.B) {
+	h := New()
+	b.ResetTimer()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		h = h.Bool(i%2 == 0)
+	}
+}
+
+func BenchmarkFloat64(b *testing.B) {
+	h := New()
+	b.ResetTimer()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		h = h.Float64(float64(i) * 3.14159)
+	}
+}
+
+func BenchmarkInt64(b *testing.B) {
+	h := New()
+	b.ResetTimer()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		h = h.Int64(int64(i))
+	}
+}
+
+func BenchmarkSum64(b *testing.B) {
+	h := New().String("key").Uint64(42).Bool(true)
+	b.ResetTimer()
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		_ = h.Sum64()
+	}
+}
@@ -10,7 +10,7 @@ import (
 	"slices"
 	"sort"

-	"go.opentelemetry.io/otel/attribute/internal/fnv"
+	"go.opentelemetry.io/otel/attribute/internal/xxhash"
 )

 type (
@@ -28,7 +28,7 @@ type (
 	// instead of a Set directly. Set has relatively poor performance when used
 	// as a map key compared to Distinct.
 	Set struct {
-		hash fnv.Hash
+		hash uint64
 		data any
 	}

@@ -37,7 +37,7 @@ type (
 	// Distinct should be used as a map key instead of a Set for to provide better
 	// performance for map operations.
 	Distinct struct {
-		hash fnv.Hash
+		hash uint64
 	}

 	// Sortable implements sort.Interface, used for sorting KeyValue.
@@ -60,18 +60,21 @@ var (
 	// keyValueType is used in computeDistinctReflect.
 	keyValueType = reflect.TypeOf(KeyValue{})

+	// emptyHash is the hash of an empty set.
+	emptyHash = xxhash.New().Sum64()
+
 	// userDefinedEmptySet is an empty set. It was mistakenly exposed to users
 	// as something they can assign to, so it must remain addressable and
 	// mutable.
 	//
 	// This is kept for backwards compatibility, but should not be used in new code.
 	userDefinedEmptySet = &Set{
-		hash: fnv.New(),
+		hash: emptyHash,
 		data: [0]KeyValue{},
 	}

 	emptySet = Set{
-		hash: fnv.New(),
+		hash: emptyHash,
 		data: [0]KeyValue{},
 	}
 )
@@ -528,8 +528,6 @@ func BenchmarkFiltering(b *testing.B) {
 	b.Run("AllDropped", benchFn(func(attribute.KeyValue) bool { return false }))
 }

-var sinkSet attribute.Set
-
 func BenchmarkNewSet(b *testing.B) {
 	attrs := []attribute.KeyValue{
 		attribute.String("B1", "2"),
@@ -542,7 +540,59 @@ func BenchmarkNewSet(b *testing.B) {
 	}
 	b.ReportAllocs()
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
-		sinkSet = attribute.NewSet(attrs...)
+	for b.Loop() {
+		attribute.NewSet(attrs...)
+	}
+}
+
+// generateStringAttrsWithSize creates 5 string attributes with specified key and value lengths.
+func generateStringAttrsWithSize(keyLen, valueLen int) []attribute.KeyValue {
+	// Generate base strings of specified lengths
+	keyBase := ""
+	valueBase := ""
+
+	// Build key base string
+	for i := 0; i < keyLen; i++ {
+		keyBase += string(rune('a' + i%26))
+	}
+
+	// Build value base string
+	for i := 0; i < valueLen; i++ {
+		valueBase += string(rune('0' + i%10))
+	}
+
+	// Create 5 attributes with different suffixes to ensure uniqueness
+	attrs := []attribute.KeyValue{
+		attribute.String(keyBase+"1", valueBase+"x"),
+		attribute.String(keyBase+"2", valueBase+"y"),
+		attribute.String(keyBase+"3", valueBase+"z"),
+		attribute.String(keyBase+"4", valueBase+"w"),
+		attribute.String(keyBase+"5", valueBase+"v"),
+	}
+	return attrs
+}
+
+func BenchmarkNewSetStringAttrs(b *testing.B) {
+	testCases := []struct {
+		name     string
+		keyLen   int
+		valueLen int
+	}{
+		{"SmallStrings", 2, 1},        // B1="2"
+		{"MediumStrings", 10, 10},     // realistic service names, etc.
+		{"LargeStrings", 25, 25},      // longer service names, URLs, etc.
+		{"VeryLargeStrings", 50, 100}, // very long values like URLs, descriptions
+		{"HugeStrings", 100, 500},     // extremely large like full URLs, JSON, etc.
+	}
+
+	for _, tc := range testCases {
+		b.Run(tc.name, func(b *testing.B) {
+			attrs := generateStringAttrsWithSize(tc.keyLen, tc.valueLen)
+			b.ReportAllocs()
+			b.ResetTimer()
+			for b.Loop() {
+				attribute.NewSet(attrs...)
+			}
+		})
 	}
 }