Colly callback not receiving variable from go func

Basically, it’s like this:

// Pseudo-code sketch of the problem (not compilable Go as written).
x := "It worked earlyer lol"
go func(x string){ // x is declared as a parameter of the goroutine's function
  c.onHTML(...){
    print(x) // the callback reads x from the enclosing function's scope
  }
  c.visit(https://www.site.com)
}

So that’s the gist: when I print the variables inside the callback they are blank, but outside of the go func they are defined. Here’s the full parent function:

// For every element matching ".rgMasterTable tr" on the page visited by
// eventCollector, read the event's name, link and state from the row and start
// a goroutine that scrapes that event's results page with a cloned collector,
// appending one Contest per matching row to fResults.
//
// wgFR, semaphore, fResults and Contest are declared outside this snippet.
eventCollector.OnHTML(".rgMasterTable tr", func(h *colly.HTMLElement) {
    // NOTE(review): rows whose third cell has no <a> (presumably a header row)
    // would yield an empty eventName/eventURL here — confirm against the page.
    eventName := h.ChildText("td:nth-child(3) a")
    eventURL := h.ChildAttr("td:nth-child(3) a", "href")
    state := h.ChildText("td:nth-child(2)")
    wgFR.Add(1)             // Increment WaitGroup counter for each goroutine
    // NOTE(review): presumably a buffered channel used to cap concurrency; if
    // so, this send blocks the callback until a slot is free — confirm.
    semaphore <- struct{}{} // Acquire a token

    // The three strings are passed as arguments, so the goroutine works on its
    // own copies of the values extracted from this row.
    go func(eventName, eventURL, state string) {
       defer wgFR.Done() // Signal completion when the goroutine exits
       defer func() { <-semaphore }() // Release the token acquired above
       contestCollector := eventCollector.Clone()
       // Written by the label callback below and read by the row callback.
       var postedDateStr string
       contestCollector.OnHTML("#ctl00_ContentPlaceHolder1_FormView1_Report_2Label", func(d *colly.HTMLElement) {
          postedDateStr = d.Text
       })

       contestCollector.OnHTML(".rgMasterTable tr", func(c *colly.HTMLElement) { // troubled line

          contestName := c.ChildText("td:nth-child(1)")
          contestURL := c.ChildAttr("td:nth-child(3) a", "href")
          if contestURL == "" {
             contestURL = "FORMAT-ERROR"
          } // tmp handler for document style results
          // NOTE(review): postedDateStr is only set if the label callback has
          // already run for this page; otherwise it is "" and parsing fails —
          // confirm the callback ordering this relies on.
          postedDate, timeErr := time.Parse("Jan 2, 2006", postedDateStr)
          if timeErr != nil {
             // The parse error itself is not included in the message, and the
             // contest is still recorded below with a zero-value postedDate.
             log.Printf("Error parsing time from %s", eventURL)
          }
          contest := Contest{
             EventName:   eventName,
             ContestName: contestName,
             PostedDate:  postedDate,
             ContestURL:  contestURL,
             State:       state,
             Present:     true,
          }
          // NOTE(review): fResults is shared by every goroutine started from
          // the outer callback and no lock is visible here; if more than one
          // goroutine can run at a time this append is a data race — confirm.
          fResults = append(fResults, contest)
       })
       err := contestCollector.Visit("https://www.judgingcard.com/Results/" + eventURL)
       if err != nil {
          // The returned err value is not included in the log message.
          log.Printf("Could not find event: %s -- %s", eventURL, eventName)
       }
       contestCollector.Wait() // Wait for the inner collector to finish
    }(eventName, eventURL, state)
})

In the full version, all variables passed to the go func come back blank when read inside the callback (contestCollector.OnHTML). Unfortunately, I’m not sure whether the issue lies in the goroutine, the fact that it is a callback, or something else.
Thanks in advance!

This seems pretty convoluted. It looks like you are using github.com/gocolly/colly. Why is that section in a goroutine exactly? I would first try getting rid of that. It seems as though the collectors are already using goroutines (I am just guessing since they have a Wait function; I didn’t look at the docs to confirm this).

This code looks like it’s missing some stuff (like where are you waiting for wgFR? Where is wgFR even defined?), so, off the top of my head, something could be modifying the strings in your outer functions in a way you might not be anticipating. In the problematic contestCollector.OnHTML callback you are not passing those values to that function, so it could be falling prey to something like this:

// main shows that a goroutine closure sees later writes to a variable it
// captured: the variable is reassigned right after the goroutine is started,
// and the goroutine sleeps briefly before reading it, so the reassigned value
// is what gets printed.
func main() {
	var done sync.WaitGroup
	message := "awesome"

	done.Add(1)
	go func() {
		// Mark the goroutine finished once the value has been printed.
		defer done.Done()
		// The short sleep gives the outer function time to reassign message.
		time.Sleep(time.Millisecond)
		fmt.Println("The value is", message)
	}()

	// Deliberately unsynchronised write after the goroutine has been started.
	message = "not awesome"
	done.Wait()
}

… which prints The value is not awesome.

1 Like

Thank you! The Colly framework does have its own manager that doesn’t require defining a new wait group. I’ve built this test script using what I’ve learned, and it works as expected.

package main

import (
	"fmt"
	"log"

	"github.com/gocolly/colly"
)

// Event holds the text scraped for one row of the event table, plus fields
// reserved for a contest belonging to that event.
type Event struct {
	// Text of the row's first and second cells.
	event_date, state string

	// Text of the third cell and the href of the link inside it.
	event_name, event_url string

	// Declared for contest data; not populated by main in this script.
	contest_name, contest_url string
}

// main scrapes the event table on the judgingcard.com front page, then visits
// each collected event link and prints the contest name and result link found
// in every table row of those pages.
func main() {
	var events []Event

	eventCollector := colly.NewCollector(colly.Async(true))
	if err := eventCollector.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 2}); err != nil {
		log.Fatalf("setting limit rule: %v", err)
	}

	eventCollector.OnHTML(".rgMasterTable tr", func(e *colly.HTMLElement) {
		// Test script: only the rows with index 0 through 5 are collected.
		if e.Index > 5 {
			return
		}

		events = append(events, Event{
			event_date: e.ChildText("td:nth-child(1)"),
			state:      e.ChildText("td:nth-child(2)"),
			event_name: e.ChildText("td:nth-child(3)"),
			event_url:  e.ChildAttr("td:nth-child(3) a", "href"),
		})
	})

	if err := eventCollector.Visit("https://www.judgingcard.com"); err != nil {
		log.Fatalf("visiting event list: %v", err)
	}
	// The collector is asynchronous; events is only complete after Wait.
	eventCollector.Wait()

	contestCollector := eventCollector.Clone()

	// Register the row handler exactly once. Registering it inside the loop
	// below would add one more identical callback per event, so every row
	// would be handled (and printed) several times.
	contestCollector.OnHTML(".rgMasterTable tr", func(c *colly.HTMLElement) {
		// Skip the header line and anything served from the results index page.
		if c.Index == 0 || c.Request.URL.String() == "https://www.judgingcard.com/Results/default.aspx" {
			return
		}
		// More contests than events / build new struct.
		contestName := c.ChildText("td:nth-child(1)")
		contestURL := c.ChildAttr("td:nth-child(3) a", "href")

		fmt.Println(contestName, contestURL)
	})

	for _, event := range events {
		target := "https://www.judgingcard.com" + event.event_url
		if err := contestCollector.Visit(target); err != nil {
			// Keep going: one bad link should not stop the remaining events.
			log.Printf("visiting %q: %v", target, err)
		}
	}
	contestCollector.Wait()
}