I’ve been working on learning Go and wrote a program to compute the Barnsley fern fractal. A large part of this computation involves generating random numbers for the random iterations needed to draw the fern.
As a test I coded the simple middle-square Weyl-sequence random number generator in Go to see whether it made my program faster or slower. From the exact same source codes the results were
-
The main Go compiler (version 1.16.3) produced an executable that ran 2.288 times slower.
-
The GNU Go compiler (version 10.2.1) produced an executable that ran 5 percent faster.
Note also that the main Go compiler was already producing an executable which ran 2 times slower than GNU Go with my original code but then started producing an executable which ran 4 times slower after I added the custom random number generator.
I’m posting here, because I’m astonished at the results and was wondering if anyone wanted to look into what’s going on. Since it’s possible the team tuning the compiler has been using a set of micro-benchmarks that is too small to ensure good performance, I’ll include the two versions of my code here for reference.
The original version:
/* fernpar.go -- Compute the Barnsley Fern */
package main
import ("fmt"; "os"; "bufio"; "runtime"; "time"; "math/rand")
const N=800000000
var tictime float64
func tic() {
now:=time.Now()
tictime=float64(now.Unix())+1.0E-9*float64(now.Nanosecond())
}
func toc() float64 {
now:=time.Now()
return float64(now.Unix())+1.0E-9*float64(now.Nanosecond())-tictime
}
var A=[4][2][2]float64{
{{0.85, 0.04}, {-0.04, 0.85}},
{{0.20, -0.26}, {0.23, 0.22}},
{{-0.15, 0.28}, {0.26, 0.24}},
{{0.00, 0.00}, {0.00, 0.16}}}
var B=[4][2]float64{
{0.00, 1.60},
{0.00, 1.60},
{0.00, 0.44},
{0.00, 0.00}}
var P=[4]float64{0.85, 0.07, 0.07, 0.01 }
var cdf [4]float64
func sum(p []float64)float64 {
r:=0.0
for i:=0; i<len(p); i++ { r+=p[i] }
return r
}
func f(i int, x [2]float64) [2]float64 {
var b [2]float64
for j:=0; j<2; j++ {
b[j]=B[i][j]
for k:=0; k<2; k++ {
b[j]+=A[i][j][k]*x[k]
}
}
return b
}
func i(p float64) int {
for j:=0; j<3; j++ {
if p<=cdf[j] {
return j
}
}
return 3
}
var xmin=[2]float64{-2.1820,0}
var xmax=[2]float64{2.6558,9.9983}
const border=0.1
const scale=617.0
var image [][]byte
func doinit() {
for i:=0; i<4; i++ {
cdf[i]=sum(P[0:i+1])
}
var nu [2]int
for i:=0; i<2; i++ {
nu[i]=int(scale*(xmax[i]-xmin[i]+2*border))
}
image=make([][]byte,nu[1])
for i:=0; i<nu[1]; i++ {
image[i]=make([]byte,nu[0])
}
}
func plot() uint64 {
count:=uint64(0)
io,_:=os.Create("fern.pnm")
fp:=bufio.NewWriter(io)
fmt.Fprintf(fp,"P4\n")
fmt.Fprintf(fp,"%d %d\n",len(image[0]),len(image))
row:=make([]byte,(len(image[0])+7)/8)
for iy:=len(image)-1; iy>=0; iy-- {
rx:=0; rb:=byte(0)
ib:=byte(128)
for ix:=0; ix<len(image[0]); ix++ {
if image[iy][ix]!=0 {
rb|=ib; count++
}
ib>>=1
if ib==0 {
row[rx]=rb; ib=128
rb=0; rx++
}
}
if ib!=0 { row[rx]=rb }
fp.Write(row)
}
fp.Flush()
io.Close()
return count
}
func point(x [2]float64) {
var coord [2]int
for i:=0; i<2; i++ {
coord[i]=int(scale*(x[i]-xmin[i]+border))
}
image[coord[1]][coord[0]]=1
}
func work(s uint64,jmax int,c chan int){
gen:=rand.New(rand.NewSource(int64(s)))
var xn=[2]float64{0.0,0.0}
point(xn)
for j:=0; j<jmax; j++ {
xn=f(i(gen.Float64()),xn)
point(xn)
}
c<-0
}
func main(){
tic()
ncpu:=runtime.GOMAXPROCS(0)
doinit()
fmt.Printf("fernpar.go -- Compute Barnsley's Fern"+
" (GOMAXPROCS=%d)\n",ncpu)
fmt.Printf("\nResolution: %d x %d\nIterations: %d\n",
len(image[0]),len(image),N)
ret:=make(chan int,ncpu)
for n:=1;n<ncpu;n++ {
go work(rand.Uint64(),N/ncpu,ret)
}
work(rand.Uint64(),N/ncpu,ret)
for n:=0;n<ncpu;n++ { <-ret }
fmt.Printf("Blk Pixels: %d\n",plot())
tsec:=toc()
fmt.Printf("\nIteration rate is %g per second.\n",N/tsec)
fmt.Printf("Total execution time %g seconds.\n",tsec)
os.Exit(0)
}
The modified version:
/* fernweyl.go -- Compute the Barnsley Fern
Modified to use the middle-square Weyl-sequence random number
generator described in https://arxiv.org/abs/1704.00358
*/
package main
import ("fmt"; "os"; "bufio"; "runtime"; "time")
const N=800000000
var tictime float64
func tic() {
now:=time.Now()
tictime=float64(now.Unix())+1.0E-9*float64(now.Nanosecond())
}
func toc() float64 {
now:=time.Now()
return float64(now.Unix())+1.0E-9*float64(now.Nanosecond())-tictime
}
var A=[4][2][2]float64{
{{0.85, 0.04}, {-0.04, 0.85}},
{{0.20, -0.26}, {0.23, 0.22}},
{{-0.15, 0.28}, {0.26, 0.24}},
{{0.00, 0.00}, {0.00, 0.16}}}
var B=[4][2]float64{
{0.00, 1.60},
{0.00, 1.60},
{0.00, 0.44},
{0.00, 0.00}}
var P=[4]float64{0.85, 0.07, 0.07, 0.01 }
var cdf [4]float64
func sum(p []float64)float64 {
r:=0.0
for i:=0; i<len(p); i++ { r+=p[i] }
return r
}
func f(i int, x [2]float64) [2]float64 {
var b [2]float64
for j:=0; j<2; j++ {
b[j]=B[i][j]
for k:=0; k<2; k++ {
b[j]+=A[i][j][k]*x[k]
}
}
return b
}
func i(p float64) int {
for j:=0; j<3; j++ {
if p<=cdf[j] {
return j
}
}
return 3
}
var xmin=[2]float64{-2.1820,0}
var xmax=[2]float64{2.6558,9.9983}
const border=0.1
const scale=617.0
var image [][]byte
func doinit() {
for i:=0; i<4; i++ {
cdf[i]=sum(P[0:i+1])
}
var nu [2]int
for i:=0; i<2; i++ {
nu[i]=int(scale*(xmax[i]-xmin[i]+2*border))
}
image=make([][]byte,nu[1])
for i:=0; i<nu[1]; i++ {
image[i]=make([]byte,nu[0])
}
}
func plot() uint64 {
count:=uint64(0)
io,_:=os.Create("fern.pnm")
fp:=bufio.NewWriter(io)
fmt.Fprintf(fp,"P4\n")
fmt.Fprintf(fp,"%d %d\n",len(image[0]),len(image))
row:=make([]byte,(len(image[0])+7)/8)
for iy:=len(image)-1; iy>=0; iy-- {
rx:=0; rb:=byte(0)
ib:=byte(128)
for ix:=0; ix<len(image[0]); ix++ {
if image[iy][ix]!=0 {
rb|=ib; count++
}
ib>>=1
if ib==0 {
row[rx]=rb; ib=128
rb=0; rx++
}
}
if ib!=0 { row[rx]=rb }
fp.Write(row)
}
fp.Flush()
io.Close()
return count
}
func point(x [2]float64) {
var coord [2]int
for i:=0; i<2; i++ {
coord[i]=int(scale*(x[i]-xmin[i]+border))
}
image[coord[1]][coord[0]]=1
}
type rstate struct {
x,w,s uint64
}
func rint32(p *rstate) uint32 {
(*p).x*=(*p).x; (*p).w+=(*p).s
(*p).x+=(*p).w; (*p).x=((*p).x>>32)|((*p).x<<32)
return uint32((*p).x)
}
func rint64(p *rstate) uint64 {
r:=uint64(rint32(p))<<32
return r|uint64(rint32(p))
}
func rfloat(p *rstate) float64 {
return float64(rint32(p))/(1<<32)
}
var gs=rstate{0,0,0xb5ad4eceda1ce2a9}
func rseed() rstate {
var p rstate
p.x=rint64(&gs); p.w=rint64(&gs)
p.s=rint64(&gs)|1
return p
}
func work(p *rstate,jmax int,c chan int){
var xn=[2]float64{0.0,0.0}
point(xn)
for j:=0; j<jmax; j++ {
xn=f(i(rfloat(p)),xn)
point(xn)
}
c<-0
}
func main(){
tic()
ncpu:=runtime.GOMAXPROCS(0)
doinit()
fmt.Printf("fernweyl.go -- Compute Barnsley's Fern"+
" (GOMAXPROCS=%d)\n",ncpu)
fmt.Printf("\nResolution: %d x %d\nIterations: %d\n",
len(image[0]),len(image),N)
ret:=make(chan int,ncpu)
for n:=1;n<ncpu;n++ {
p:=rseed()
go work(&p,N/ncpu,ret)
}
p:=rseed()
work(&p,N/ncpu,ret)
for n:=0;n<ncpu;n++ { <-ret }
fmt.Printf("Blk Pixels: %d\n",plot())
tsec:=toc()
fmt.Printf("\nIteration rate is %g per second.\n",N/tsec)
fmt.Printf("Total execution time %g seconds.\n",tsec)
os.Exit(0)
}
Any insight into why the modified code speeds things up on GNU Go while slowing things down on the main Go compiler would be great.