Performance Weirdness

I’ve been working on learning Go and wrote a program to compute the Barnsley fern fractal. A large part of this computation involves generating random numbers for the random iterations needed to draw the fern.

As a test I coded the simple middle-square Weyl-sequence random number generator in Go to see whether it made my program faster or slower. From the exact same source code, the results were

  • The main Go compiler (version 1.16.3) produced an executable that ran 2.288 times slower.

  • The GNU Go compiler (version 10.2.1) produced an executable that ran 5 percent faster.

Note also that the main Go compiler was already producing an executable which ran 2 times slower than GNU Go with my original code but then started producing an executable which ran 4 times slower after I added the custom random number generator.

I’m posting here, because I’m astonished at the results and was wondering if anyone wanted to look into what’s going on. Since it’s possible the team tuning the compiler has been using a set of micro-benchmarks that is too small to ensure good performance, I’ll include the two versions of my code here for reference.

The original version:

/*  fernpar.go -- Compute the Barnsley Fern */

package main

import ("fmt"; "os"; "bufio"; "runtime"; "time"; "math/rand")

const N=800000000

// tictime is the wall-clock reading, in seconds since the Unix epoch,
// stored by the most recent call to tic.
var tictime float64

// tic stores the current wall-clock time in tictime.
func tic() {
	t := time.Now()
	tictime = float64(t.Unix()) + 1.0e-9*float64(t.Nanosecond())
}

// toc returns the number of seconds that have passed since the last tic.
func toc() float64 {
	t := time.Now()
	current := float64(t.Unix()) + 1.0e-9*float64(t.Nanosecond())
	return current - tictime
}

// A holds one 2x2 matrix per map; f multiplies A[i] by the current point.
var A=[4][2][2]float64{
    {{0.85, 0.04}, {-0.04, 0.85}},
    {{0.20, -0.26}, {0.23, 0.22}},
    {{-0.15, 0.28}, {0.26, 0.24}},
    {{0.00, 0.00}, {0.00, 0.16}}}

// B holds one offset vector per map; f uses B[i] as the starting value of
// its result before adding the matrix product.
var B=[4][2]float64{
    {0.00, 1.60},
    {0.00, 1.60},
    {0.00, 0.44},
    {0.00, 0.00}}

// P holds one weight per map; doinit stores the running sums of P in cdf.
var P=[4]float64{0.85, 0.07, 0.07, 0.01 }
// cdf[k] is P[0]+...+P[k], filled in by doinit. The function i compares its
// argument against cdf[0], cdf[1] and cdf[2] to choose a map index.
var cdf [4]float64

// sum returns the elements of p added from first to last, starting from 0.
func sum(p []float64) float64 {
	total := 0.0
	for _, v := range p {
		total += v
	}
	return total
}

// f applies map number i to the point x and returns B[i] + A[i]*x.
//
// Each component starts at B[i][row] and then has the products
// A[i][row][0]*x[0] and A[i][row][1]*x[1] added in that order.
func f(i int, x [2]float64) [2]float64 {
	out := B[i]
	for row := range out {
		for col, xc := range x {
			out[row] += A[i][row][col] * xc
		}
	}
	return out
}

// i returns the smallest index j in 0..2 with p <= cdf[j], or 3 when p is
// greater than all three of those entries.
func i(p float64) int {
	switch {
	case p <= cdf[0]:
		return 0
	case p <= cdf[1]:
		return 1
	case p <= cdf[2]:
		return 2
	}
	return 3
}

// xmin is subtracted from each point, per coordinate, before scaling to a
// pixel position (see point); together with xmax it fixes the image size
// computed in doinit.
var xmin=[2]float64{-2.1820,0}
var xmax=[2]float64{2.6558,9.9983}
// border is the margin, in coordinate units, added on each side: doinit adds
// 2*border to each extent and point shifts every coordinate by border.
const border=0.1

// scale converts coordinate units to pixels.
const scale=617.0
// image is indexed as image[coord[1]][coord[0]]; point stores 1 in the cell
// of every visited pixel and plot treats any nonzero cell as set.
var image [][]byte

func doinit() {
    for i:=0; i<4; i++ {
        cdf[i]=sum(P[0:i+1])
    }
    var nu [2]int
    for i:=0; i<2; i++ {
        nu[i]=int(scale*(xmax[i]-xmin[i]+2*border))
    }
    image=make([][]byte,nu[1])
    for i:=0; i<nu[1]; i++ {
        image[i]=make([]byte,nu[0])
    }
}

// plot writes image to the file fern.pnm as a raw ("P4") portable bitmap and
// returns the number of nonzero cells in image.
//
// Rows are written from the last row of image to the first. Each output row
// is packed eight pixels per byte, most significant bit first, with the last
// byte padded with zero bits when the width is not a multiple of eight.
//
// I/O failures are reported on standard error; the pixel count is returned
// in every case. If the file cannot be created the pixels are still counted.
func plot() uint64 {
	count := uint64(0)
	file, err := os.Create("fern.pnm")
	if err != nil {
		fmt.Fprintf(os.Stderr, "plot: %v\n", err)
		for _, line := range image {
			for _, px := range line {
				if px != 0 {
					count++
				}
			}
		}
		return count
	}
	fp := bufio.NewWriter(file)
	width := len(image[0])
	fmt.Fprintf(fp, "P4\n")
	fmt.Fprintf(fp, "%d %d\n", width, len(image))
	row := make([]byte, (width+7)/8)
	for iy := len(image) - 1; iy >= 0; iy-- {
		rx := 0
		rb := byte(0)
		ib := byte(128)
		for ix := 0; ix < width; ix++ {
			if image[iy][ix] != 0 {
				rb |= ib
				count++
			}
			ib >>= 1
			if ib == 0 {
				// A full byte has been assembled; store it and start the next.
				row[rx] = rb
				ib = 128
				rb = 0
				rx++
			}
		}
		// ib is reset to 128 after every full byte, so it differs from 128
		// only when a partially filled byte is pending. Testing ib != 0 here
		// would always be true and would index one past the end of row when
		// the width is a multiple of eight.
		if ib != 128 {
			row[rx] = rb
		}
		fp.Write(row)
	}
	// bufio.Writer remembers the first write error, so checking Flush also
	// covers the Fprintf and Write calls above.
	if err := fp.Flush(); err != nil {
		fmt.Fprintf(os.Stderr, "plot: %v\n", err)
	}
	if err := file.Close(); err != nil {
		fmt.Fprintf(os.Stderr, "plot: %v\n", err)
	}
	return count
}

// point marks the pixel that contains x by storing 1 in image.
//
// Coordinate k of x maps to int(scale*(x[k]-xmin[k]+border)); coordinate 0
// selects the position within a row and coordinate 1 selects the row.
// work calls point from every worker goroutine and the stores are not
// synchronized; every store writes the same value, 1.
func point(x [2]float64) {
	col := int(scale * (x[0] - xmin[0] + border))
	row := int(scale * (x[1] - xmin[1] + border))
	image[row][col] = 1
}

// work plots the origin and then jmax further points, each produced by
// applying f with the map index that i selects for a fresh random float.
//
// s seeds a private math/rand generator. When all points have been plotted,
// work sends 0 on c.
func work(s uint64, jmax int, c chan int) {
	rng := rand.New(rand.NewSource(int64(s)))
	var pos [2]float64
	point(pos)
	for remaining := jmax; remaining > 0; remaining-- {
		pos = f(i(rng.Float64()), pos)
		point(pos)
	}
	c <- 0
}

// main times the whole computation: it initializes the tables and image,
// runs N/ncpu iterations in each of ncpu workers (ncpu-1 goroutines plus
// the calling goroutine), writes the image with plot and prints the timing.
func main() {
	tic()
	ncpu := runtime.GOMAXPROCS(0)
	doinit()
	fmt.Printf("fernpar.go -- Compute Barnsley's Fern (GOMAXPROCS=%d)\n", ncpu)
	fmt.Printf("\nResolution: %d x %d\nIterations: %d\n",
		len(image[0]), len(image), N)

	done := make(chan int, ncpu)
	share := N / ncpu
	for k := 1; k < ncpu; k++ {
		go work(rand.Uint64(), share, done)
	}
	// The calling goroutine acts as the last worker.
	work(rand.Uint64(), share, done)
	for k := 0; k < ncpu; k++ {
		<-done
	}

	fmt.Printf("Blk Pixels: %d\n", plot())
	elapsed := toc()
	fmt.Printf("\nIteration rate is %g per second.\n", N/elapsed)
	fmt.Printf("Total execution time %g seconds.\n", elapsed)
	os.Exit(0)
}

The modified version:

/*  fernweyl.go -- Compute the Barnsley Fern

    Modified to use the middle-square Weyl-sequence random number
    generator described in https://arxiv.org/abs/1704.00358
 */

package main

import ("fmt"; "os"; "bufio"; "runtime"; "time")

const N=800000000

// tictime is the wall-clock reading, in seconds since the Unix epoch,
// stored by the most recent call to tic.
var tictime float64

// tic stores the current wall-clock time in tictime.
func tic() {
	t := time.Now()
	tictime = float64(t.Unix()) + 1.0e-9*float64(t.Nanosecond())
}

// toc returns the number of seconds that have passed since the last tic.
func toc() float64 {
	t := time.Now()
	current := float64(t.Unix()) + 1.0e-9*float64(t.Nanosecond())
	return current - tictime
}

// A holds one 2x2 matrix per map; f multiplies A[i] by the current point.
var A=[4][2][2]float64{
    {{0.85, 0.04}, {-0.04, 0.85}},
    {{0.20, -0.26}, {0.23, 0.22}},
    {{-0.15, 0.28}, {0.26, 0.24}},
    {{0.00, 0.00}, {0.00, 0.16}}}

// B holds one offset vector per map; f uses B[i] as the starting value of
// its result before adding the matrix product.
var B=[4][2]float64{
    {0.00, 1.60},
    {0.00, 1.60},
    {0.00, 0.44},
    {0.00, 0.00}}

// P holds one weight per map; doinit stores the running sums of P in cdf.
var P=[4]float64{0.85, 0.07, 0.07, 0.01 }
// cdf[k] is P[0]+...+P[k], filled in by doinit. The function i compares its
// argument against cdf[0], cdf[1] and cdf[2] to choose a map index.
var cdf [4]float64

// sum returns the elements of p added from first to last, starting from 0.
func sum(p []float64) float64 {
	total := 0.0
	for _, v := range p {
		total += v
	}
	return total
}

// f applies map number i to the point x and returns B[i] + A[i]*x.
//
// Each component starts at B[i][row] and then has the products
// A[i][row][0]*x[0] and A[i][row][1]*x[1] added in that order.
func f(i int, x [2]float64) [2]float64 {
	out := B[i]
	for row := range out {
		for col, xc := range x {
			out[row] += A[i][row][col] * xc
		}
	}
	return out
}

// i returns the smallest index j in 0..2 with p <= cdf[j], or 3 when p is
// greater than all three of those entries.
func i(p float64) int {
	switch {
	case p <= cdf[0]:
		return 0
	case p <= cdf[1]:
		return 1
	case p <= cdf[2]:
		return 2
	}
	return 3
}

// xmin is subtracted from each point, per coordinate, before scaling to a
// pixel position (see point); together with xmax it fixes the image size
// computed in doinit.
var xmin=[2]float64{-2.1820,0}
var xmax=[2]float64{2.6558,9.9983}
// border is the margin, in coordinate units, added on each side: doinit adds
// 2*border to each extent and point shifts every coordinate by border.
const border=0.1

// scale converts coordinate units to pixels.
const scale=617.0
// image is indexed as image[coord[1]][coord[0]]; point stores 1 in the cell
// of every visited pixel and plot treats any nonzero cell as set.
var image [][]byte

func doinit() {
    for i:=0; i<4; i++ {
        cdf[i]=sum(P[0:i+1])
    }
    var nu [2]int
    for i:=0; i<2; i++ {
        nu[i]=int(scale*(xmax[i]-xmin[i]+2*border))
    }
    image=make([][]byte,nu[1])
    for i:=0; i<nu[1]; i++ {
        image[i]=make([]byte,nu[0])
    }
}

// plot writes image to the file fern.pnm as a raw ("P4") portable bitmap and
// returns the number of nonzero cells in image.
//
// Rows are written from the last row of image to the first. Each output row
// is packed eight pixels per byte, most significant bit first, with the last
// byte padded with zero bits when the width is not a multiple of eight.
//
// I/O failures are reported on standard error; the pixel count is returned
// in every case. If the file cannot be created the pixels are still counted.
func plot() uint64 {
	count := uint64(0)
	file, err := os.Create("fern.pnm")
	if err != nil {
		fmt.Fprintf(os.Stderr, "plot: %v\n", err)
		for _, line := range image {
			for _, px := range line {
				if px != 0 {
					count++
				}
			}
		}
		return count
	}
	fp := bufio.NewWriter(file)
	width := len(image[0])
	fmt.Fprintf(fp, "P4\n")
	fmt.Fprintf(fp, "%d %d\n", width, len(image))
	row := make([]byte, (width+7)/8)
	for iy := len(image) - 1; iy >= 0; iy-- {
		rx := 0
		rb := byte(0)
		ib := byte(128)
		for ix := 0; ix < width; ix++ {
			if image[iy][ix] != 0 {
				rb |= ib
				count++
			}
			ib >>= 1
			if ib == 0 {
				// A full byte has been assembled; store it and start the next.
				row[rx] = rb
				ib = 128
				rb = 0
				rx++
			}
		}
		// ib is reset to 128 after every full byte, so it differs from 128
		// only when a partially filled byte is pending. Testing ib != 0 here
		// would always be true and would index one past the end of row when
		// the width is a multiple of eight.
		if ib != 128 {
			row[rx] = rb
		}
		fp.Write(row)
	}
	// bufio.Writer remembers the first write error, so checking Flush also
	// covers the Fprintf and Write calls above.
	if err := fp.Flush(); err != nil {
		fmt.Fprintf(os.Stderr, "plot: %v\n", err)
	}
	if err := file.Close(); err != nil {
		fmt.Fprintf(os.Stderr, "plot: %v\n", err)
	}
	return count
}

// point marks the pixel that contains x by storing 1 in image.
//
// Coordinate k of x maps to int(scale*(x[k]-xmin[k]+border)); coordinate 0
// selects the position within a row and coordinate 1 selects the row.
// work calls point from every worker goroutine and the stores are not
// synchronized; every store writes the same value, 1.
func point(x [2]float64) {
	col := int(scale * (x[0] - xmin[0] + border))
	row := int(scale * (x[1] - xmin[1] + border))
	image[row][col] = 1
}

// rstate is the state of one middle-square Weyl-sequence generator:
// x is the value being squared, w the running Weyl sum and s the
// increment added to w on every step.
type rstate struct {
	x, w, s uint64
}

// rint32 advances p by one step and returns the low 32 bits of the new x.
//
// A step adds s to w, replaces x by x*x+w (modulo 2^64) and then swaps the
// upper and lower 32-bit halves of x.
func rint32(p *rstate) uint32 {
	p.w += p.s
	sq := p.x*p.x + p.w
	p.x = sq>>32 | sq<<32
	return uint32(p.x)
}

// rint64 draws two 32-bit values from p; the first becomes the upper half
// of the result and the second the lower half.
func rint64(p *rstate) uint64 {
	hi := uint64(rint32(p))
	lo := uint64(rint32(p))
	return hi<<32 | lo
}

// rfloat draws one 32-bit value from p and divides it by 2^32.
func rfloat(p *rstate) float64 {
	const two32 = 1 << 32
	return float64(rint32(p)) / two32
}

// gs is the generator from which rseed draws the state of new generators.
var gs = rstate{0, 0, 0xb5ad4eceda1ce2a9}

// rseed returns a new generator state whose x, w and s are taken, in that
// order, from successive 64-bit outputs of gs; the lowest bit of s is set.
func rseed() rstate {
	x := rint64(&gs)
	w := rint64(&gs)
	s := rint64(&gs) | 1
	return rstate{x: x, w: w, s: s}
}

// work plots the origin and then jmax further points, each produced by
// applying f with the map index that i selects for rfloat(p).
//
// p is the generator state used for this worker. When all points have been
// plotted, work sends 0 on c.
func work(p *rstate, jmax int, c chan int) {
	var pos [2]float64
	point(pos)
	for remaining := jmax; remaining > 0; remaining-- {
		pos = f(i(rfloat(p)), pos)
		point(pos)
	}
	c <- 0
}

// main times the whole computation: it initializes the tables and image,
// runs N/ncpu iterations in each of ncpu workers (ncpu-1 goroutines plus
// the calling goroutine), writes the image with plot and prints the timing.
// Every worker gets its own generator state from rseed.
func main() {
	tic()
	ncpu := runtime.GOMAXPROCS(0)
	doinit()
	fmt.Printf("fernweyl.go -- Compute Barnsley's Fern (GOMAXPROCS=%d)\n", ncpu)
	fmt.Printf("\nResolution: %d x %d\nIterations: %d\n",
		len(image[0]), len(image), N)

	done := make(chan int, ncpu)
	share := N / ncpu
	for k := 1; k < ncpu; k++ {
		st := rseed()
		go work(&st, share, done)
	}
	// The calling goroutine acts as the last worker.
	own := rseed()
	work(&own, share, done)
	for k := 0; k < ncpu; k++ {
		<-done
	}

	fmt.Printf("Blk Pixels: %d\n", plot())
	elapsed := toc()
	fmt.Printf("\nIteration rate is %g per second.\n", N/elapsed)
	fmt.Printf("Total execution time %g seconds.\n", elapsed)
	os.Exit(0)
}

Any insight into why the modified code speeds things up on GNU Go while slowing things down on the main Go compiler would be great.

time.Now() is quite slow.
Why not use the math/rand package to generate random numbers?

While it makes sense to use the go test -bench framework when tuning the compiler, including a single call within the program to get the wall time at the beginning and end provides a sanity check of total system performance and allows comparison to other programming languages. Given that the runtime varies from 3.7 seconds with gccgo to about 15 seconds using go build, the overhead of two calls to the system clock would appear insignificant.

You will notice that the original code did use math/rand. The reason to switch to the middle-square Weyl-sequence random number generator was to create a repeatable calculation that can be used to compare different programming languages. For example, the default random number generator in Go is very different than the one used in Julia. Therefore, to make a meaningful comparison of the quality of code generated by the JIT used in Julia to the fast Go compiler one needs to include an identical random number generator as part of the program itself. Note that Julia runs the same computation in 3.9 seconds (on the same hardware) which makes it much closer to gccgo than the standard go build tool chain.

The fact that the middle-square Weyl-sequence random number generator is exceedingly simple and claims cryptographic security could be useful in some applications, but obviously not for generating fern fractals. However, from a performance point of view, it should be pointed out that using the exact same random sequence is important for this particular calculation because the pattern of assignments

image[coord[1]][coord[0]]=1

in the point(x) function is sensitive to hardware-level cache contention and invalidation. In particular, the same pattern of writes needs to occur in any implementation of the program for a meaningful comparison between compiler versions, programming languages and computing hardware.

While the fact that go build creates binaries that run slower than gccgo is not surprising, the factor-four performance difference in favor of gccgo for the present code seems too much. I’ve compared the assembly output between gccgo and go compile to see whether either accidentally inserted any thread locking into the code to prevent data races but did not notice anything.

Does anyone know why the executable produced by go build is four times slower than the one produced by gccgo? Is this the expected performance difference?

I tidied up the code a bit to use methods for the random number generator and remove writing the explicit pointer dereferences that cluttered the visual aspects of the program.

/*  fernfast.go -- Compute the Barnsley Fern

    Modified to use the middle-square Weyl-sequence random number
    generator described in https://arxiv.org/abs/1704.00358
 */

package main

import ("fmt"; "os"; "bufio"; "runtime"; "time")

const N=800000000

// tictime is the wall-clock reading, in seconds since the Unix epoch,
// stored by the most recent call to tic.
var tictime float64

// tic stores the current wall-clock time in tictime.
func tic() {
	t := time.Now()
	tictime = float64(t.Unix()) + 1.0e-9*float64(t.Nanosecond())
}

// toc returns the number of seconds that have passed since the last tic.
func toc() float64 {
	t := time.Now()
	current := float64(t.Unix()) + 1.0e-9*float64(t.Nanosecond())
	return current - tictime
}

// A holds one 2x2 matrix per map; f multiplies A[i] by the current point.
var A=[4][2][2]float64{
    {{0.85, 0.04}, {-0.04, 0.85}},
    {{0.20, -0.26}, {0.23, 0.22}},
    {{-0.15, 0.28}, {0.26, 0.24}},
    {{0.00, 0.00}, {0.00, 0.16}}}

// B holds one offset vector per map; f uses B[i] as the starting value of
// its result before adding the matrix product.
var B=[4][2]float64{
    {0.00, 1.60},
    {0.00, 1.60},
    {0.00, 0.44},
    {0.00, 0.00}}

// P holds one weight per map; doinit derives cdf from the running sums of P.
var P=[4]float64{0.85, 0.07, 0.07, 0.01 }
// cdf[k] is (P[0]+...+P[k]) multiplied by 2^32 and converted to uint32,
// filled in by doinit. The function i compares a raw 32-bit random value
// against these thresholds to choose a map index.
var cdf [3]uint32

// sum returns the elements of p added from first to last, starting from 0.
func sum(p []float64) float64 {
	total := 0.0
	for _, v := range p {
		total += v
	}
	return total
}

// f applies map number i to the point x and returns B[i] + A[i]*x.
//
// Each component starts at B[i][row] and then has the products
// A[i][row][0]*x[0] and A[i][row][1]*x[1] added in that order.
func f(i int, x [2]float64) [2]float64 {
	out := B[i]
	for row := range out {
		for col, xc := range x {
			out[row] += A[i][row][col] * xc
		}
	}
	return out
}

// i returns the smallest index j in 0..2 with p < cdf[j], or 3 when p is
// not below any of the three thresholds.
func i(p uint32) int {
	switch {
	case p < cdf[0]:
		return 0
	case p < cdf[1]:
		return 1
	case p < cdf[2]:
		return 2
	}
	return 3
}

// xmin is subtracted from each point, per coordinate, before scaling to a
// pixel position (see point); together with xmax it fixes the image size
// computed in doinit.
var xmin=[2]float64{-2.1820,0}
var xmax=[2]float64{2.6558,9.9983}
// border is the margin, in coordinate units, added on each side: doinit adds
// 2*border to each extent and point shifts every coordinate by border.
const border=0.1

// scale converts coordinate units to pixels.
const scale=617.0
// image is indexed as image[coord[1]][coord[0]]; point stores 1 in the cell
// of every visited pixel and plot treats any nonzero cell as set.
var image [][]byte

func doinit() {
    for i:=0; i<3; i++ {
        cdf[i]=uint32(sum(P[0:i+1])*(1<<32))
    }
    var nu [2]int
    for i:=0; i<2; i++ {
        nu[i]=int(scale*(xmax[i]-xmin[i]+2*border))
    }
    image=make([][]byte,nu[1])
    for i:=0; i<nu[1]; i++ {
        image[i]=make([]byte,nu[0])
    }
}

// plot writes image to the file fern.pnm as a raw ("P4") portable bitmap and
// returns the number of nonzero cells in image.
//
// Rows are written from the last row of image to the first. Each output row
// is packed eight pixels per byte, most significant bit first, with the last
// byte padded with zero bits when the width is not a multiple of eight.
//
// I/O failures are reported on standard error; the pixel count is returned
// in every case. If the file cannot be created the pixels are still counted.
func plot() uint64 {
	count := uint64(0)
	file, err := os.Create("fern.pnm")
	if err != nil {
		fmt.Fprintf(os.Stderr, "plot: %v\n", err)
		for _, line := range image {
			for _, px := range line {
				if px != 0 {
					count++
				}
			}
		}
		return count
	}
	fp := bufio.NewWriter(file)
	width := len(image[0])
	fmt.Fprintf(fp, "P4\n")
	fmt.Fprintf(fp, "%d %d\n", width, len(image))
	row := make([]byte, (width+7)/8)
	for iy := len(image) - 1; iy >= 0; iy-- {
		rx := 0
		rb := byte(0)
		ib := byte(128)
		for ix := 0; ix < width; ix++ {
			if image[iy][ix] != 0 {
				rb |= ib
				count++
			}
			ib >>= 1
			if ib == 0 {
				// A full byte has been assembled; store it and start the next.
				row[rx] = rb
				ib = 128
				rb = 0
				rx++
			}
		}
		// ib is reset to 128 after every full byte, so it differs from 128
		// only when a partially filled byte is pending. Testing ib != 0 here
		// would always be true and would index one past the end of row when
		// the width is a multiple of eight.
		if ib != 128 {
			row[rx] = rb
		}
		fp.Write(row)
	}
	// bufio.Writer remembers the first write error, so checking Flush also
	// covers the Fprintf and Write calls above.
	if err := fp.Flush(); err != nil {
		fmt.Fprintf(os.Stderr, "plot: %v\n", err)
	}
	if err := file.Close(); err != nil {
		fmt.Fprintf(os.Stderr, "plot: %v\n", err)
	}
	return count
}

// point marks the pixel that contains x by storing 1 in image.
//
// Coordinate k of x maps to int(scale*(x[k]-xmin[k]+border)); coordinate 0
// selects the position within a row and coordinate 1 selects the row.
// work calls point from every worker goroutine and the stores are not
// synchronized; every store writes the same value, 1.
func point(x [2]float64) {
	col := int(scale * (x[0] - xmin[0] + border))
	row := int(scale * (x[1] - xmin[1] + border))
	image[row][col] = 1
}

// rstate is the state of one middle-square Weyl-sequence generator:
// x is the value being squared, w the running Weyl sum and s the
// increment added to w on every step.
type rstate struct {
	x, w, s uint64
}

// rint32 advances the generator by one step and returns the low 32 bits of
// the new x.
//
// A step adds s to w, replaces x by x*x+w (modulo 2^64) and then swaps the
// upper and lower 32-bit halves of x.
func (rs *rstate) rint32() uint32 {
	rs.w += rs.s
	sq := rs.x*rs.x + rs.w
	rs.x = sq>>32 | sq<<32
	return uint32(rs.x)
}

// rint64 draws two 32-bit values; the first becomes the upper half of the
// result and the second the lower half.
func (rs *rstate) rint64() uint64 {
	hi := uint64(rs.rint32())
	lo := uint64(rs.rint32())
	return hi<<32 | lo
}

// gs is the generator from which rseed draws the state of new generators.
var gs = rstate{0, 0, 0xb5ad4eceda1ce2a9}

// rseed returns a new generator state whose x, w and s are taken, in that
// order, from successive 64-bit outputs of gs; the lowest bit of s is set.
func rseed() rstate {
	x := gs.rint64()
	w := gs.rint64()
	s := gs.rint64() | 1
	return rstate{x: x, w: w, s: s}
}

// work plots the origin and then jmax further points, each produced by
// applying f with the map index that i selects for p.rint32().
//
// p is the generator state used for this worker. When all points have been
// plotted, work sends 0 on c.
func work(p *rstate, jmax int, c chan int) {
	var pos [2]float64
	point(pos)
	for remaining := jmax; remaining > 0; remaining-- {
		pos = f(i(p.rint32()), pos)
		point(pos)
	}
	c <- 0
}

// main times the whole computation: it initializes the tables and image,
// runs N/ncpu iterations in each of ncpu workers (ncpu-1 goroutines plus
// the calling goroutine), writes the image with plot and prints the timing.
// Every worker gets its own generator state from rseed.
func main() {
	tic()
	ncpu := runtime.GOMAXPROCS(0)
	doinit()
	fmt.Printf("fernfast.go -- Compute Barnsley's Fern (GOMAXPROCS=%d)\n", ncpu)
	fmt.Printf("\nResolution: %d x %d\nIterations: %d\n",
		len(image[0]), len(image), N)

	done := make(chan int, ncpu)
	share := N / ncpu
	for k := 1; k < ncpu; k++ {
		st := rseed()
		go work(&st, share, done)
	}
	// The calling goroutine acts as the last worker.
	own := rseed()
	work(&own, share, done)
	for k := 0; k < ncpu; k++ {
		<-done
	}

	fmt.Printf("Blk Pixels: %d\n", plot())
	elapsed := toc()
	fmt.Printf("\nIteration rate is %g per second.\n", N/elapsed)
	fmt.Printf("Total execution time %g seconds.\n", elapsed)
	os.Exit(0)
}

Except for a minor improvement that comes from eliminating one of the floating-point operations per iteration, the execution speed of the cleaned up code is the same as before. In particular it’s still the case that gccgo is producing an x86 binary that runs 3 to 4 times faster than the go build compiler.

I decided to perform the same test on the Raspberry Pi 4B running in 64-bit ARMv8 mode. Much to my surprise, gccgo did not produce a significantly faster executable in this case.

$ ./fernfast # using go build version 1.16.3 on ARMv8
fernfast.go -- Compute Barnsley's Fern (GOMAXPROCS=4)

Resolution: 3108 x 6292
Iterations: 800000000
Blk Pixels: 4826441

Iteration rate is 2.6407432347526528e+07 per second.
Total execution time 30.294501543045044 seconds.
$ ./fernfast-gcc # using gccgo version 10.3.0 on ARMv8
fernfast.go -- Compute Barnsley's Fern (GOMAXPROCS=4)

Resolution: 3108 x 6292
Iterations: 800000000
Blk Pixels: 4826441

Iteration rate is 2.7498114230911233e+07 per second.
Total execution time 29.092904090881348 seconds.

On the other hand, the Ryzen 1700 produces

$ ./fernfast.gcc # using gccgo version 10.2.1 on x86
fernfast.go -- Compute Barnsley's Fern (GOMAXPROCS=16)

Resolution: 3108 x 6292
Iterations: 800000000
Blk Pixels: 4826397

Iteration rate is 2.2060141084731704e+08 per second.
Total execution time 3.6264500617980957 seconds.
$ ./fernfast # using go build version 1.16.3 on x86
fernfast.go -- Compute Barnsley's Fern (GOMAXPROCS=16)

Resolution: 3108 x 6292
Iterations: 800000000
Blk Pixels: 4826397

Iteration rate is 5.5024985423174635e+07 per second.
Total execution time 14.538849830627441 seconds.

Something seems strange with the x86 code generator in go build for this particular program. Does anyone know how to diagnose this? Am I posting in the correct forum?

Sounds like instruction vectorization might be in play. Have you checked the generated assembly?

I just looked more carefully at the assembler output from gccgo. With -O3 the compiler places the call to the random number generator inline as well as the calls to f and the indexing function. Surprisingly, the call to the function which records the pixel in the shared array is not inline. At any rate, the floating point uses scalar SSE2 mulsd and addsd instructions. I did not see any AVX or AVX512 instructions. Note that the integer operations used to implement the random number generator also seemed pretty standard.

When I turned off optimization in gccgo using -O0, the resulting performance was about the same as go build. I haven’t looked at the assembler output from the go compile command as carefully.

Is it possible go build failed to inline both f and the random number generator? Is there a way to inline code more aggressively with go build?

I have created objdump -S output for both the go build and gccgo executables. As most of the execution time is spent in the work routine

// work plots the origin and then jmax further points, each obtained by
// applying f with the map index that i selects for p.rint32(); it sends 0
// on c when done.
func work(p *rstate,jmax int,c chan int){
    var xn=[2]float64{0.0,0.0}
    point(xn)
    for j:=0; j<jmax; j++ {
        xn=f(i(p.rint32()),xn)
        point(xn)
    }
    c<-0
}

I’ll post the generated x86 instructions here for comparison. Note that go build inlines the point() function as well as f() and i() while gccgo only inlines the latter two.

Here is the gccgo output

00000000000002a0 <main.work>:
    return p
}

func work(p *rstate,jmax int,c chan int){
 2a0:	64 48 3b 24 25 70 00 	cmp    %fs:0x70,%rsp
 2a7:	00 00 
 2a9:	0f 82 63 01 00 00    	jb     412 <main.work+0x172>
 2af:	41 57                	push   %r15
 2b1:	49 89 ff             	mov    %rdi,%r15
 2b4:	41 56                	push   %r14
 2b6:	41 55                	push   %r13
 2b8:	41 54                	push   %r12
 2ba:	55                   	push   %rbp
 2bb:	53                   	push   %rbx
 2bc:	48 89 f3             	mov    %rsi,%rbx
 2bf:	48 83 ec 38          	sub    $0x38,%rsp
 2c3:	48 89 54 24 18       	mov    %rdx,0x18(%rsp)
 2c8:	64 48 8b 04 25 28 00 	mov    %fs:0x28,%rax
 2cf:	00 00 
 2d1:	48 89 44 24 28       	mov    %rax,0x28(%rsp)
 2d6:	31 c0                	xor    %eax,%eax
    var xn=[2]float64{0.0,0.0}
    point(xn)
 2d8:	66 48 0f 6e c8       	movq   %rax,%xmm1
 2dd:	66 48 0f 6e c0       	movq   %rax,%xmm0
 2e2:	e8 09 ff ff ff       	callq  1f0 <main.point>
    for j:=0; j<jmax; j++ {
 2e7:	48 85 db             	test   %rbx,%rbx
 2ea:	0f 8e da 00 00 00    	jle    3ca <main.work+0x12a>
    var xn=[2]float64{0.0,0.0}
 2f0:	66 0f ef c9          	pxor   %xmm1,%xmm1
    for j:=0; j<jmax; j++ {
 2f4:	45 31 f6             	xor    %r14d,%r14d
 2f7:	4c 8d 2d 00 00 00 00 	lea    0x0(%rip),%r13        # 2fe <main.work+0x5e>
    var xn=[2]float64{0.0,0.0}
 2fe:	66 0f 28 c1          	movapd %xmm1,%xmm0
 302:	4c 8d 25 00 00 00 00 	lea    0x0(%rip),%r12        # 309 <main.work+0x69>
 309:	48 8d 2d 00 00 00 00 	lea    0x0(%rip),%rbp        # 310 <main.work+0x70>
 310:	eb 0a                	jmp    31c <main.work+0x7c>
 312:	66 0f 1f 44 00 00    	nopw   0x0(%rax,%rax,1)
        xn=f(i(p.rint32()),xn)
 318:	66 0f 28 c2          	movapd %xmm2,%xmm0
    p.x*=p.x; p.w+=p.s
 31c:	49 8b 07             	mov    (%r15),%rax
 31f:	49 8b 4f 10          	mov    0x10(%r15),%rcx
 323:	49 03 4f 08          	add    0x8(%r15),%rcx
 327:	48 0f af c0          	imul   %rax,%rax
 32b:	49 89 4f 08          	mov    %rcx,0x8(%r15)
    p.x+=p.w; p.x=(p.x>>32)|(p.x<<32)
 32f:	48 01 c8             	add    %rcx,%rax
 332:	48 c1 c0 20          	rol    $0x20,%rax
 336:	49 89 07             	mov    %rax,(%r15)
        if p<cdf[j] {
 339:	41 3b 45 00          	cmp    0x0(%r13),%eax
 33d:	0f 82 be 00 00 00    	jb     401 <main.work+0x161>
 343:	3b 05 00 00 00 00    	cmp    0x0(%rip),%eax        # 349 <main.work+0xa9>
 349:	0f 82 b9 00 00 00    	jb     408 <main.work+0x168>
    for j:=0; j<3; j++ {
 34f:	39 05 00 00 00 00    	cmp    %eax,0x0(%rip)        # 355 <main.work+0xb5>
 355:	0f 96 c0             	setbe  %al
 358:	0f b6 c0             	movzbl %al,%eax
 35b:	48 83 c0 02          	add    $0x2,%rax
        b[j]=B[i][j]
 35f:	48 89 c1             	mov    %rax,%rcx
            b[j]+=A[i][j][k]*x[k]
 362:	48 c1 e0 05          	shl    $0x5,%rax
    for j:=0; j<jmax; j++ {
 366:	49 83 c6 01          	add    $0x1,%r14
            b[j]+=A[i][j][k]*x[k]
 36a:	48 01 e8             	add    %rbp,%rax
        b[j]=B[i][j]
 36d:	48 c1 e1 04          	shl    $0x4,%rcx
            b[j]+=A[i][j][k]*x[k]
 371:	f2 0f 10 58 08       	movsd  0x8(%rax),%xmm3
 376:	f2 0f 10 10          	movsd  (%rax),%xmm2
        b[j]=B[i][j]
 37a:	4c 01 e1             	add    %r12,%rcx
            b[j]+=A[i][j][k]*x[k]
 37d:	f2 0f 59 d9          	mulsd  %xmm1,%xmm3
 381:	f2 0f 59 d0          	mulsd  %xmm0,%xmm2
 385:	f2 0f 59 48 18       	mulsd  0x18(%rax),%xmm1
 38a:	f2 0f 58 11          	addsd  (%rcx),%xmm2
 38e:	f2 0f 59 40 10       	mulsd  0x10(%rax),%xmm0
 393:	f2 0f 58 41 08       	addsd  0x8(%rcx),%xmm0
 398:	f2 0f 58 d3          	addsd  %xmm3,%xmm2
 39c:	f2 0f 58 c8          	addsd  %xmm0,%xmm1
        point(xn)
 3a0:	66 0f 28 c2          	movapd %xmm2,%xmm0
 3a4:	f2 0f 11 54 24 10    	movsd  %xmm2,0x10(%rsp)
 3aa:	f2 0f 11 4c 24 08    	movsd  %xmm1,0x8(%rsp)
 3b0:	e8 3b fe ff ff       	callq  1f0 <main.point>
    for j:=0; j<jmax; j++ {
 3b5:	4c 39 f3             	cmp    %r14,%rbx
 3b8:	f2 0f 10 4c 24 08    	movsd  0x8(%rsp),%xmm1
 3be:	f2 0f 10 54 24 10    	movsd  0x10(%rsp),%xmm2
 3c4:	0f 85 4e ff ff ff    	jne    318 <main.work+0x78>
    }
    c<-0
 3ca:	48 8b 7c 24 18       	mov    0x18(%rsp),%rdi
 3cf:	48 8d 74 24 20       	lea    0x20(%rsp),%rsi
 3d4:	48 c7 44 24 20 00 00 	movq   $0x0,0x20(%rsp)
 3db:	00 00 
 3dd:	e8 00 00 00 00       	callq  3e2 <main.work+0x142>
func work(p *rstate,jmax int,c chan int){
 3e2:	48 8b 44 24 28       	mov    0x28(%rsp),%rax
 3e7:	64 48 2b 04 25 28 00 	sub    %fs:0x28,%rax
 3ee:	00 00 
 3f0:	75 34                	jne    426 <main.work+0x186>
 3f2:	48 83 c4 38          	add    $0x38,%rsp
 3f6:	5b                   	pop    %rbx
 3f7:	5d                   	pop    %rbp
 3f8:	41 5c                	pop    %r12
 3fa:	41 5d                	pop    %r13
 3fc:	41 5e                	pop    %r14
 3fe:	41 5f                	pop    %r15
 400:	c3                   	retq   
    for j:=0; j<3; j++ {
 401:	31 c0                	xor    %eax,%eax
 403:	e9 57 ff ff ff       	jmpq   35f <main.work+0xbf>
 408:	b8 01 00 00 00       	mov    $0x1,%eax
 40d:	e9 4d ff ff ff       	jmpq   35f <main.work+0xbf>
func work(p *rstate,jmax int,c chan int){
 412:	41 ba 68 00 00 00    	mov    $0x68,%r10d
 418:	45 31 db             	xor    %r11d,%r11d
 41b:	e8 00 00 00 00       	callq  420 <main.work+0x180>
 420:	c3                   	retq   
 421:	e9 89 fe ff ff       	jmpq   2af <main.work+0xf>
 426:	e8 00 00 00 00       	callq  42b <main.main..thunk0>

while the go build output looks like

000000000049a920 <main.work>:
}

func work(p *rstate,jmax int,c chan int){
  49a920:	64 48 8b 0c 25 f8 ff 	mov    %fs:0xfffffffffffffff8,%rcx
  49a927:	ff ff 
  49a929:	48 8d 44 24 f8       	lea    -0x8(%rsp),%rax
  49a92e:	48 3b 41 10          	cmp    0x10(%rcx),%rax
  49a932:	0f 86 a8 02 00 00    	jbe    49abe0 <main.work+0x2c0>
  49a938:	48 81 ec 88 00 00 00 	sub    $0x88,%rsp
  49a93f:	48 89 ac 24 80 00 00 	mov    %rbp,0x80(%rsp)
  49a946:	00 
  49a947:	48 8d ac 24 80 00 00 	lea    0x80(%rsp),%rbp
  49a94e:	00 
    var xn=[2]float64{0.0,0.0}
  49a94f:	0f 57 c0             	xorps  %xmm0,%xmm0
  49a952:	0f 11 44 24 10       	movups %xmm0,0x10(%rsp)
    point(xn)
  49a957:	0f 11 44 24 40       	movups %xmm0,0x40(%rsp)
    var coord [2]int
  49a95c:	0f 11 44 24 60       	movups %xmm0,0x60(%rsp)
  49a961:	31 c0                	xor    %eax,%eax
    for i:=0; i<2; i++ {
  49a963:	eb 3b                	jmp    49a9a0 <main.work+0x80>
        coord[i]=int(scale*(x[i]-xmin[i]+border))
  49a965:	f2 0f 10 4c c4 40    	movsd  0x40(%rsp,%rax,8),%xmm1
  49a96b:	48 8d 15 2e 1c 0a 00 	lea    0xa1c2e(%rip),%rdx        # 53c5a0 <main.xmin>
  49a972:	f2 0f 5c 0c c2       	subsd  (%rdx,%rax,8),%xmm1
  49a977:	f2 0f 10 15 31 2e 04 	movsd  0x42e31(%rip),%xmm2        # 4dd7b0 <$f64.3fb999999999999a>
  49a97e:	00 
  49a97f:	f2 0f 58 ca          	addsd  %xmm2,%xmm1
  49a983:	f2 0f 10 1d a5 2e 04 	movsd  0x42ea5(%rip),%xmm3        # 4dd830 <$f64.4083480000000000>
  49a98a:	00 
  49a98b:	f2 0f 59 cb          	mulsd  %xmm3,%xmm1
  49a98f:	f2 48 0f 2c d9       	cvttsd2si %xmm1,%rbx
  49a994:	48 89 5c c4 60       	mov    %rbx,0x60(%rsp,%rax,8)
    for i:=0; i<2; i++ {
  49a999:	48 ff c0             	inc    %rax
  49a99c:	0f 1f 40 00          	nopl   0x0(%rax)
  49a9a0:	48 83 f8 02          	cmp    $0x2,%rax
  49a9a4:	7c bf                	jl     49a965 <main.work+0x45>
    image[coord[1]][coord[0]]=1
  49a9a6:	48 8b 15 43 78 0b 00 	mov    0xb7843(%rip),%rdx        # 5521f0 <main.image>
  49a9ad:	48 8b 0d 44 78 0b 00 	mov    0xb7844(%rip),%rcx        # 5521f8 <main.image+0x8>
  49a9b4:	48 8b 44 24 68       	mov    0x68(%rsp),%rax
  49a9b9:	0f 1f 80 00 00 00 00 	nopl   0x0(%rax)
  49a9c0:	48 39 c8             	cmp    %rcx,%rax
  49a9c3:	0f 83 10 02 00 00    	jae    49abd9 <main.work+0x2b9>
  49a9c9:	48 8d 1c 40          	lea    (%rax,%rax,2),%rbx
  49a9cd:	48 8b 4c da 08       	mov    0x8(%rdx,%rbx,8),%rcx
  49a9d2:	48 8b 14 da          	mov    (%rdx,%rbx,8),%rdx
  49a9d6:	48 8b 44 24 60       	mov    0x60(%rsp),%rax
  49a9db:	0f 1f 44 00 00       	nopl   0x0(%rax,%rax,1)
  49a9e0:	48 39 c8             	cmp    %rcx,%rax
  49a9e3:	0f 83 eb 01 00 00    	jae    49abd4 <main.work+0x2b4>
  49a9e9:	c6 04 02 01          	movb   $0x1,(%rdx,%rax,1)
    point(xn)
  49a9ed:	48 8b 94 24 98 00 00 	mov    0x98(%rsp),%rdx
  49a9f4:	00 
  49a9f5:	48 8b 9c 24 90 00 00 	mov    0x90(%rsp),%rbx
  49a9fc:	00 
  49a9fd:	31 c0                	xor    %eax,%eax
  49a9ff:	90                   	nop
  49aa00:	e9 48 01 00 00       	jmpq   49ab4d <main.work+0x22d>
    for j:=0; j<3; j++ {
  49aa05:	48 ff c1             	inc    %rcx
  49aa08:	48 83 f9 03          	cmp    $0x3,%rcx
  49aa0c:	0f 8d 67 01 00 00    	jge    49ab79 <main.work+0x259>
        if p<cdf[j] {
  49aa12:	48 8d 3d d7 4d 0e 00 	lea    0xe4dd7(%rip),%rdi        # 57f7f0 <main.cdf>
  49aa19:	44 8b 04 8f          	mov    (%rdi,%rcx,4),%r8d
  49aa1d:	0f 1f 00             	nopl   (%rax)
  49aa20:	41 39 f0             	cmp    %esi,%r8d
  49aa23:	76 e0                	jbe    49aa05 <main.work+0xe5>
    for j:=0; j<jmax; j++ {
        xn=f(i(p.rint32()),xn)
  49aa25:	0f 10 4c 24 10       	movups 0x10(%rsp),%xmm1
  49aa2a:	0f 11 4c 24 20       	movups %xmm1,0x20(%rsp)
    var b [2]float64
  49aa2f:	0f 11 44 24 70       	movups %xmm0,0x70(%rsp)
  49aa34:	31 f6                	xor    %esi,%esi
    for j:=0; j<2; j++ {
  49aa36:	eb 4b                	jmp    49aa83 <main.work+0x163>
            b[j]+=A[i][j][k]*x[k]
  49aa38:	4d 89 c2             	mov    %r8,%r10
  49aa3b:	49 c1 e0 05          	shl    $0x5,%r8
  49aa3f:	4c 8d 1d 9a 46 0a 00 	lea    0xa469a(%rip),%r11        # 53f0e0 <main.A>
  49aa46:	4f 8d 24 03          	lea    (%r11,%r8,1),%r12
  49aa4a:	49 89 f5             	mov    %rsi,%r13
  49aa4d:	48 c1 e6 04          	shl    $0x4,%rsi
  49aa51:	49 01 f4             	add    %rsi,%r12
  49aa54:	f2 41 0f 10 0c cc    	movsd  (%r12,%rcx,8),%xmm1
  49aa5a:	f2 0f 59 4c cc 20    	mulsd  0x20(%rsp,%rcx,8),%xmm1
  49aa60:	f2 42 0f 58 4c ec 70 	addsd  0x70(%rsp,%r13,8),%xmm1
  49aa67:	f2 42 0f 11 4c ec 70 	movsd  %xmm1,0x70(%rsp,%r13,8)
        for k:=0; k<2; k++ {
  49aa6e:	48 ff c1             	inc    %rcx
            b[j]+=A[i][j][k]*x[k]
  49aa71:	4c 89 ee             	mov    %r13,%rsi
  49aa74:	4d 89 d0             	mov    %r10,%r8
        for k:=0; k<2; k++ {
  49aa77:	48 83 f9 02          	cmp    $0x2,%rcx
  49aa7b:	7c bb                	jl     49aa38 <main.work+0x118>
    for j:=0; j<2; j++ {
  49aa7d:	48 ff c6             	inc    %rsi
        b[j]=B[i][j]
  49aa80:	4c 89 c1             	mov    %r8,%rcx
    for j:=0; j<2; j++ {
  49aa83:	48 83 fe 02          	cmp    $0x2,%rsi
  49aa87:	7d 2c                	jge    49aab5 <main.work+0x195>
        b[j]=B[i][j]
  49aa89:	48 83 f9 04          	cmp    $0x4,%rcx
  49aa8d:	0f 83 34 01 00 00    	jae    49abc7 <main.work+0x2a7>
  49aa93:	49 89 c8             	mov    %rcx,%r8
  49aa96:	48 c1 e1 04          	shl    $0x4,%rcx
  49aa9a:	4c 8d 0d 5f 35 0a 00 	lea    0xa355f(%rip),%r9        # 53e000 <main.B>
  49aaa1:	4d 8d 14 09          	lea    (%r9,%rcx,1),%r10
  49aaa5:	f2 41 0f 10 0c f2    	movsd  (%r10,%rsi,8),%xmm1
  49aaab:	f2 0f 11 4c f4 70    	movsd  %xmm1,0x70(%rsp,%rsi,8)
  49aab1:	31 c9                	xor    %ecx,%ecx
        for k:=0; k<2; k++ {
  49aab3:	eb c2                	jmp    49aa77 <main.work+0x157>
        xn=f(i(p.rint32()),xn)
  49aab5:	0f 10 4c 24 70       	movups 0x70(%rsp),%xmm1
  49aaba:	0f 11 4c 24 10       	movups %xmm1,0x10(%rsp)
        point(xn)
  49aabf:	0f 10 4c 24 70       	movups 0x70(%rsp),%xmm1
  49aac4:	0f 11 4c 24 30       	movups %xmm1,0x30(%rsp)
    var coord [2]int
  49aac9:	0f 11 44 24 50       	movups %xmm0,0x50(%rsp)
  49aace:	31 c9                	xor    %ecx,%ecx
    for i:=0; i<2; i++ {
  49aad0:	eb 37                	jmp    49ab09 <main.work+0x1e9>
        coord[i]=int(scale*(x[i]-xmin[i]+border))
  49aad2:	f2 0f 10 4c cc 30    	movsd  0x30(%rsp,%rcx,8),%xmm1
  49aad8:	48 8d 35 c1 1a 0a 00 	lea    0xa1ac1(%rip),%rsi        # 53c5a0 <main.xmin>
  49aadf:	f2 0f 5c 0c ce       	subsd  (%rsi,%rcx,8),%xmm1
  49aae4:	f2 0f 10 15 c4 2c 04 	movsd  0x42cc4(%rip),%xmm2        # 4dd7b0 <$f64.3fb999999999999a>
  49aaeb:	00 
  49aaec:	f2 0f 58 ca          	addsd  %xmm2,%xmm1
  49aaf0:	f2 0f 10 1d 38 2d 04 	movsd  0x42d38(%rip),%xmm3        # 4dd830 <$f64.4083480000000000>
  49aaf7:	00 
  49aaf8:	f2 0f 59 cb          	mulsd  %xmm3,%xmm1
  49aafc:	f2 4c 0f 2c c1       	cvttsd2si %xmm1,%r8
  49ab01:	4c 89 44 cc 50       	mov    %r8,0x50(%rsp,%rcx,8)
    for i:=0; i<2; i++ {
  49ab06:	48 ff c1             	inc    %rcx
  49ab09:	48 83 f9 02          	cmp    $0x2,%rcx
  49ab0d:	7c c3                	jl     49aad2 <main.work+0x1b2>
    image[coord[1]][coord[0]]=1
  49ab0f:	48 8b 35 da 76 0b 00 	mov    0xb76da(%rip),%rsi        # 5521f0 <main.image>
  49ab16:	48 8b 0d db 76 0b 00 	mov    0xb76db(%rip),%rcx        # 5521f8 <main.image+0x8>
  49ab1d:	4c 8b 44 24 58       	mov    0x58(%rsp),%r8
  49ab22:	49 39 c8             	cmp    %rcx,%r8
  49ab25:	0f 83 94 00 00 00    	jae    49abbf <main.work+0x29f>
  49ab2b:	4f 8d 04 40          	lea    (%r8,%r8,2),%r8
  49ab2f:	4a 8b 4c c6 08       	mov    0x8(%rsi,%r8,8),%rcx
  49ab34:	4a 8b 34 c6          	mov    (%rsi,%r8,8),%rsi
  49ab38:	4c 8b 44 24 50       	mov    0x50(%rsp),%r8
  49ab3d:	0f 1f 00             	nopl   (%rax)
  49ab40:	4c 39 c1             	cmp    %r8,%rcx
  49ab43:	76 72                	jbe    49abb7 <main.work+0x297>
  49ab45:	42 c6 04 06 01       	movb   $0x1,(%rsi,%r8,1)
    for j:=0; j<jmax; j++ {
  49ab4a:	48 ff c0             	inc    %rax
  49ab4d:	48 39 c2             	cmp    %rax,%rdx
  49ab50:	7e 38                	jle    49ab8a <main.work+0x26a>
    p.x*=p.x; p.w+=p.s
  49ab52:	48 8b 33             	mov    (%rbx),%rsi
  49ab55:	48 0f af f6          	imul   %rsi,%rsi
  49ab59:	48 89 33             	mov    %rsi,(%rbx)
  49ab5c:	48 8b 7b 08          	mov    0x8(%rbx),%rdi
  49ab60:	48 03 7b 10          	add    0x10(%rbx),%rdi
  49ab64:	48 89 7b 08          	mov    %rdi,0x8(%rbx)
    p.x+=p.w; p.x=(p.x>>32)|(p.x<<32)
  49ab68:	48 01 fe             	add    %rdi,%rsi
  49ab6b:	48 c1 c6 20          	rol    $0x20,%rsi
  49ab6f:	48 89 33             	mov    %rsi,(%rbx)
        xn=f(i(p.rint32()),xn)
  49ab72:	31 c9                	xor    %ecx,%ecx
    for j:=0; j<3; j++ {
  49ab74:	e9 8f fe ff ff       	jmpq   49aa08 <main.work+0xe8>
  49ab79:	48 8d 3d 70 4c 0e 00 	lea    0xe4c70(%rip),%rdi        # 57f7f0 <main.cdf>
  49ab80:	b9 03 00 00 00       	mov    $0x3,%ecx
        xn=f(i(p.rint32()),xn)
  49ab85:	e9 9b fe ff ff       	jmpq   49aa25 <main.work+0x105>
    }
    c<-0
  49ab8a:	48 8b 84 24 a0 00 00 	mov    0xa0(%rsp),%rax
  49ab91:	00 
  49ab92:	48 89 04 24          	mov    %rax,(%rsp)
  49ab96:	48 8d 05 f3 2c 04 00 	lea    0x42cf3(%rip),%rax        # 4dd890 <$f64.bfe62e42fefa39ef+0x10>
  49ab9d:	48 89 44 24 08       	mov    %rax,0x8(%rsp)
  49aba2:	e8 f9 9e f6 ff       	callq  404aa0 <runtime.chansend1>
}
  49aba7:	48 8b ac 24 80 00 00 	mov    0x80(%rsp),%rbp
  49abae:	00 
  49abaf:	48 81 c4 88 00 00 00 	add    $0x88,%rsp
  49abb6:	c3                   	retq   
    image[coord[1]][coord[0]]=1
  49abb7:	4c 89 c0             	mov    %r8,%rax
  49abba:	e8 a1 9a fc ff       	callq  464660 <runtime.panicIndex>
  49abbf:	4c 89 c0             	mov    %r8,%rax
  49abc2:	e8 99 9a fc ff       	callq  464660 <runtime.panicIndex>
        b[j]=B[i][j]
  49abc7:	48 89 c8             	mov    %rcx,%rax
  49abca:	b9 04 00 00 00       	mov    $0x4,%ecx
  49abcf:	e8 8c 9a fc ff       	callq  464660 <runtime.panicIndex>
    image[coord[1]][coord[0]]=1
  49abd4:	e8 87 9a fc ff       	callq  464660 <runtime.panicIndex>
  49abd9:	e8 82 9a fc ff       	callq  464660 <runtime.panicIndex>
  49abde:	90                   	nop
func work(p *rstate,jmax int,c chan int){
  49abdf:	90                   	nop
  49abe0:	e8 3b 7a fc ff       	callq  462620 <runtime.morestack_noctxt>
  49abe5:	e9 36 fd ff ff       	jmpq   49a920 <main.work>
  49abea:	cc                   	int3   
  49abeb:	cc                   	int3   
  49abec:	cc                   	int3   
  49abed:	cc                   	int3   
  49abee:	cc                   	int3   
  49abef:	cc                   	int3   
  49abf0:	cc                   	int3   
  49abf1:	cc                   	int3   
  49abf2:	cc                   	int3   
  49abf3:	cc                   	int3   
  49abf4:	cc                   	int3   
  49abf5:	cc                   	int3   
  49abf6:	cc                   	int3   
  49abf7:	cc                   	int3   
  49abf8:	cc                   	int3   
  49abf9:	cc                   	int3   
  49abfa:	cc                   	int3   
  49abfb:	cc                   	int3   
  49abfc:	cc                   	int3   
  49abfd:	cc                   	int3   
  49abfe:	cc                   	int3   
  49abff:	cc                   	int3   

I think the go build version is more verbose because of inlining point(). Unfortunately, I’m not experienced enough to determine why it runs 4 times slower than the gccgo executable.

Does anyone see anything wrong with the way I have written my Go code or the assembler output itself? Could there be undefined behavior that is resolved differently by each compiler? Is this program something that should be added to the benchmark suite so that the compiler developers can think about why the go build version runs so much slower? What’s wrong?