I am using colly for a data scraping job and since I have multiple sites to scrape, I want to reuse some of the settings. I have created a package called client containing the following code
package client
import (
"bufio"
"fmt"
"github.com/gocolly/colly"
"log"
"math/rand"
"os"
"time"
)
var userAgents []string
func readUserAgents() []string {
path, err := os.Getwd()
if err != nil {
log.Fatal(err)
}
file, err := os.Open(fmt.Sprintf("%s/client/agents.txt", path))
if err != nil {
log.Fatal(err)
}
defer file.Close()
var agents []string
scanner := bufio.NewScanner(file)
for scanner.Scan() {
agents = append(agents, scanner.Text())
}
if err := scanner.Err(); err != nil {
log.Fatal(err)
}
return agents
}
var c *colly.Collector
// init prepares shared package state at import time: it loads the
// User-Agent pool from disk and seeds the RNG used to rotate agents.
func init() {
	userAgents = readUserAgents()
	// Seed with wall-clock nanoseconds so each run rotates agents in a
	// different order. NOTE(review): rand.Seed is deprecated since
	// Go 1.20, where the global source is auto-seeded.
	rand.Seed(time.Now().UnixNano())
}
// NewClient returns a *colly.Collector preconfigured with the shared
// scraping defaults: max depth 2, a parallelism/delay rate limit, a
// randomly rotated User-Agent header, and basic logging callbacks.
// Callers layer site-specific settings (AllowedDomains, etc.) on top
// of the returned collector.
func NewClient() *colly.Collector {
	c = colly.NewCollector(
		colly.MaxDepth(2),
	)
	// BUG FIX: a LimitRule must carry DomainGlob or DomainRegexp —
	// without one, colly's Limit rejects the rule (ErrNoPattern), and
	// because the returned error was previously discarded, the rate
	// limit was silently never applied. Match every domain and surface
	// any configuration error.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		RandomDelay: 10 * time.Second,
		Parallelism: 2,
	}); err != nil {
		log.Fatal(err)
	}
	c.OnRequest(func(r *colly.Request) {
		// Rotate through the loaded pool so requests are harder to
		// fingerprint as a bot.
		r.Headers.Set("User-Agent", userAgents[rand.Intn(len(userAgents))])
	})
	c.OnError(func(_ *colly.Response, err error) {
		// NOTE(review): log.Fatal aborts the whole process on a single
		// failed request; log.Println may suit best-effort scraping better.
		log.Fatal(err)
	})
	c.OnScraped(func(r *colly.Response) {
		log.Printf("Done %s", r.Request.URL)
	})
	return c
}
Now when using this client I can call NewClient() and it returns a *colly.Collector with some boilerplate setup for setting headers and error handling.
But I still have to write a lot of boilerplate when using this client package: per site I have to set AllowedDomains, DomainGlob, and potentially other site-specific configuration.
So my question is, what is the go-way to do this? Would it be creating a package where I store all the configurations only and not a “wrapper-like” approach? So basically a bunch of constants?