Here is a benchmark that compares four atomic increments against a single mutex-protected critical section:
// Save this in a _test.go file (e.g. counter_test.go) so `go test -bench` can pick up the benchmarks.
package main

import (
	"sync"
	"sync/atomic"
	"testing"
)

type AtomicCounter struct {
	A int64
	B int64
	C int64
	D int64
}

type MutexCounter struct {
	A  int64
	B  int64
	C  int64
	D  int64
	mu sync.Mutex
}

// IncrementAtomic updates each field with an independent atomic read-modify-write.
func IncrementAtomic(c *AtomicCounter, delta int64) {
	atomic.AddInt64(&c.A, delta)
	atomic.AddInt64(&c.B, delta)
	atomic.AddInt64(&c.C, delta)
	atomic.AddInt64(&c.D, delta)
}

// IncrementMutex updates all four fields with plain adds inside one critical section.
func IncrementMutex(c *MutexCounter, delta int64) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.A += delta
	c.B += delta
	c.C += delta
	c.D += delta
}

// Benchmark for Atomic Increment
func BenchmarkIncrementAtomic(b *testing.B) {
	counter := &AtomicCounter{}
	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			IncrementAtomic(counter, 1)
		}
	})
}

// Benchmark for Mutex Increment
func BenchmarkIncrementMutex(b *testing.B) {
	counter := &MutexCounter{}
	b.RunParallel(func(pb *testing.PB) {
		for pb.Next() {
			IncrementMutex(counter, 1)
		}
	})
}
The results show that the lock performs better once parallelism grows:
goos: darwin
goarch: arm64
pkg: t
cpu: Apple M2 Max
BenchmarkIncrementAtomic
BenchmarkIncrementAtomic-2 41731690 25.47 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-2 50620629 25.21 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-2 45867760 25.26 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-4 23364030 51.48 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-4 23443662 51.09 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-4 24704637 50.02 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-8 14516994 77.24 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-8 15323640 70.95 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-8 16269632 74.57 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-12 9426187 127.9 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-12 9647442 118.3 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-12 9670734 116.5 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-16 9547399 127.9 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-16 9255408 129.1 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-16 9288032 123.9 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-1024 10477359 127.6 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-1024 9347586 123.1 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-1024 9970914 104.3 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-2048 12710971 120.5 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-2048 17835810 111.5 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementAtomic-2048 12470020 117.6 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex
BenchmarkIncrementMutex-2 28628401 82.97 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-2 29020346 82.75 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-2 29471953 85.90 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-4 12342710 96.54 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-4 12148924 95.72 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-4 12092522 96.49 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-8 14736597 79.07 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-8 14910420 77.23 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-8 15160263 77.65 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-12 15025918 78.02 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-12 15581076 76.57 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-12 15080223 77.05 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-16 15165836 78.86 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-16 15863791 77.28 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-16 15384672 75.76 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-1024 13133365 84.49 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-1024 13068876 83.09 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-1024 13828316 83.57 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-2048 12515894 82.34 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-2048 12778197 83.96 ns/op 0 B/op 0 allocs/op
BenchmarkIncrementMutex-2048 12766641 82.67 ns/op 0 B/op 0 allocs/op
PASS
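Averaging the three runs at each -cpu setting (derived from the raw output above) makes the crossover easier to see:

-cpu    atomic (ns/op)    mutex (ns/op)
2       25.3              83.9
4       50.9              96.3
8       74.3              78.0
12      120.9             77.2
16      127.0             77.3
1024    118.3             83.7
2048    116.5             83.0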
At 12 or more parallel workers, four atomic operations (and, in my tests, even two) are already slower than a single lock/unlock pair. In high-concurrency scenarios, which should we choose?
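For comparison, one option that often comes up for write-hot counters is sharding: give each writer its own cache-line-padded slot and sum the slots on read. The sketch below only illustrates that idea and is not code from the benchmark above; the caller-supplied shard hint and the 64-byte line size are assumptions.

package main

import (
	"fmt"
	"runtime"
	"sync"
	"sync/atomic"
)

// paddedInt64 occupies a full 64-byte cache line (an assumed line size)
// so that writers on different shards do not invalidate each other's lines.
type paddedInt64 struct {
	n atomic.Int64
	_ [56]byte
}

// ShardedCounter spreads increments across GOMAXPROCS slots.
type ShardedCounter struct {
	shards []paddedInt64
}

func NewShardedCounter() *ShardedCounter {
	return &ShardedCounter{shards: make([]paddedInt64, runtime.GOMAXPROCS(0))}
}

// Add increments one shard. The caller supplies a shard hint (e.g. a worker
// index); a production version would pick shards differently.
func (c *ShardedCounter) Add(hint int, delta int64) {
	c.shards[hint%len(c.shards)].n.Add(delta)
}

// Load sums all shards. The total may be momentarily stale while writers run.
func (c *ShardedCounter) Load() int64 {
	var total int64
	for i := range c.shards {
		total += c.shards[i].n.Load()
	}
	return total
}

func main() {
	c := NewShardedCounter()
	var wg sync.WaitGroup
	for w := 0; w < 8; w++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			for i := 0; i < 1000; i++ {
				c.Add(id, 1)
			}
		}(w)
	}
	wg.Wait()
	fmt.Println(c.Load()) // 8000
}

Whether sharding actually wins depends on how often the counter is read, since Load has to walk every shard.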