How is GC & inlining influencing this code?

package main

import (
    "fmt"
    "math"
    "os"
    "sync"
    "text/tabwriter"
    "time"
)

func producer(wg *sync.WaitGroup, l sync.Locker) {
    defer wg.Done()

    for i := 0; i < 15; i++ {
        l.Lock()
        l.Unlock()

        time.Sleep(1)
    }
}

func observer(wg *sync.WaitGroup, l sync.Locker) {
    defer wg.Done()
    l.Lock()
    defer l.Unlock()
}

func test(count int, mutext, rwMutex sync.Locker) time.Duration {
    var wg sync.WaitGroup
    wg.Add(count + 1)

    begin := time.Now()

    go producer(&wg, mutext)
    for i := 0; i < count; i++ {
        go observer(&wg, rwMutex)
    }

    wg.Wait()

    return time.Since(begin)
}

func main() {
    tw := tabwriter.NewWriter(os.Stdout, 0, 1, 2, ' ', 0)
    defer tw.Flush()

    var m sync.RWMutex
    fmt.Fprintf(tw, "Readers\tRWMutext\tMutex\n")

    for i := 0; i < 18; i++ {
        count := int(math.Pow(2, float64(i)))

        fmt.Fprintf(tw, "%d\t%v\t%v\n", count, test(count, &m, m.RLocker()),
            test(count, &m, &m))
    }
}

This produces:
Readers RWMutext Mutex
1 30.173µs 33.075µs
2 6.991µs 67.438µs
4 31.798µs 46.757µs
8 15.977µs 7.22µs
16 99.907µs 38.771µs
32 57.922µs 42.454µs
64 203.127µs 86.813µs
128 98.211µs 140.335µs
256 149.461µs 225.813µs
512 107.206µs 130.003µs
1024 292.564µs 234.575µs
2048 467.862µs 509.994µs
4096 1.090972ms 914.308µs
8192 1.992554ms 1.798584ms
16384 4.314152ms 3.566635ms
32768 8.154836ms 7.483657ms
65536 16.807987ms 14.413483ms
131072 35.100699ms 28.784819ms

look at the last instruction:

fmt.Fprintf(tw, "%d\t%v\t%v\n", count, test(count, &m, m.RLocker()), test(count, &m, &m))

If I change the last instruction to:

fmt.Fprintf(tw, "%d\t%v\t%v\n", count, test(count, &m, &m), test(count, &m, m.RLocker()))

I then get:
1 52.005µs 12.136µs
2 10.268µs 6.304µs
4 8.222µs 41.576µs
8 15.647µs 7.521µs
16 48.071µs 60.577µs
32 33.093µs 34.517µs
64 63.459µs 85.228µs
128 110.46µs 108.612µs
256 110.729µs 89.459µs
512 177.386µs 170.756µs
1024 275.454µs 245.833µs
2048 559.328µs 493.242µs
4096 1.029273ms 1.024745ms
8192 1.784677ms 2.066452ms
16384 3.640868ms 4.232521ms
32768 7.27125ms 8.054228ms
65536 14.409217ms 16.423354ms
131072 27.935498ms 32.925838ms

Clearly RWMutext should produce better results (aquire the lock for less time). Unfortunately, due to what I assume to be {compile optimizations, GC, inlining} the second test() invocation has an advantage. No matter if I invoke it with RWMutext or just Mutext.

Why is this ?

For the more critical minded, here is what happens when I change that line to:

fmt.Fprintf(tw, "%d\t%v\t%v\n", count, test(count, &m, m.RLocker()), test(count, &m, m.RLocker()))

Output is:

1 75.217µs 45.262µs
2 20.555µs 5.608µs
4 23.769µs 33.54µs
8 11.955µs 31.59µs
16 34.788µs 20.266µs
32 180.366µs 111.972µs
64 72.018µs 94.648µs
128 212.692µs 263.827µs
256 114.372µs 76.824µs
512 252.35µs 218.792µs
1024 316.893µs 300.179µs
2048 582.167µs 552.773µs
4096 1.026949ms 1.024502ms
8192 2.043287ms 2.010938ms
16384 4.281553ms 4.092734ms
32768 8.212327ms 8.322347ms
65536 16.556263ms 16.702198ms
131072 33.212763ms 33.016844ms