Goroutines and Web-Crawler

Farkhad_Rakhimzhanov · April 9, 2018, 4:11pm

The example below fetches the first URL and then the program exits, because the recursive calls are started as goroutines (go Crawl()) and nothing waits for them.
How can I wait until the crawler finishes its work (other than calling fmt.Scanln() at the end)?

func Crawl(url string, depth int) {
 if depth <= 0 {
  return 
 }

 urls, err := Fetch(url)
 if err != nil  {
  fmt.Println(err)
  return
 }
 fmt.Println("Found urls in ", url)

 for _, u := urls {  
  go Crawl(u, depth-1)
 }

 return
}

func Fetch(url string) []string, error {
 // ... fetching/processing data from url
 if urls, ok := data[url]; ok {
  return urls, nil
 } 
 return nil, fmt.Errorf("error %s", url) 
}

// main starts the crawl at the root page with a fixed depth budget.
// It returns as soon as Crawl does and does not wait for any goroutines
// that Crawl started.
func main() {
	const (
		rootURL  = "golang.org"
		maxDepth = 4
	)
	Crawl(rootURL, maxDepth)
}

PS
This is an exercise from A Tour of Go: Exercise: Web Crawler

lutzhorn · April 9, 2018, 6:33pm

Use a sync.WaitGroup. Take a look at the example.

Farkhad_Rakhimzhanov · April 10, 2018, 9:16am

Thanks to lutzhorn's advice and some solutions on Stack Overflow, I've finally got it working!

/*
https://tour.golang.org/concurrency/10
Exercise: Web Crawler
In this exercise you'll use Go's concurrency features
to parallelize a web crawler.
Modify the Crawl function to fetch URLs in parallel
without fetching the same URL twice.
Hint: you can keep a cache of the URLs that have been fetched on a map,
but maps alone are not safe for concurrent use!
*/
package main

import (
	"fmt"
	"sync"
)

// Fetcher is the page source that Crawl works against: it retrieves pages
// and tracks which URLs have already been handled.
type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)

	// IsProcessed returns true if URL has been already fetched & processed.
	// NOTE(review): the fakeFetcher implementation in this file also records
	// url as processed when it returns false, so the call is check-and-mark
	// rather than a pure query.
	IsProcessed(url string) bool
}

// Package-level state shared by all Crawl goroutines.
//
// Channel declarations that are currently disabled:
//	var ch chan string = make(chan string)
//	var quit chan bool = make(chan bool)
var (
	// wg counts outstanding Crawl calls so that main can wait for all of
	// them to finish.
	wg sync.WaitGroup

	// urlCache holds the URLs that IsProcessed has already seen.
	urlCache = make(map[string]bool)

	// mux guards urlCache, which is accessed from several goroutines.
	mux sync.Mutex
)

// Crawl uses fetcher to recursively crawl pages starting with url, to a
// maximum of depth. Every link found on a page is crawled in its own
// goroutine, and a URL that fetcher reports as already processed is skipped.
//
// The caller must call wg.Add(1) before each call to Crawl; Crawl calls
// wg.Done exactly once when it returns, whichever path it takes.
func Crawl(url string, depth int, fetcher Fetcher) {
	defer wg.Done()

	// Check the depth budget before consulting fetcher.IsProcessed. The
	// fakeFetcher implementation marks the URL as processed on that call, so
	// asking first would flag a URL reached with no depth left as done
	// without fetching it, and a later visit with depth remaining would then
	// skip it.
	if depth <= 0 {
		return
	}
	if fetcher.IsProcessed(url) {
		return
	}

	body, urls, err := fetcher.Fetch(url)
	if err != nil {
		fmt.Println(err)
		return
	}
	fmt.Printf("found: %s %q\n", url, body)

	for _, u := range urls {
		// Register the child before starting it so the counter cannot reach
		// zero while work is still being scheduled.
		wg.Add(1)
		go Crawl(u, depth-1, fetcher)
	}
}

// main launches the root crawl in its own goroutine and blocks until that
// crawl and every crawl it spawned have finished.
func main() {
	const (
		rootURL  = "https://golang.org/"
		maxDepth = 4
	)

	wg.Add(1) // account for the root Crawl before it is started
	go Crawl(rootURL, maxDepth, fetcher)
	wg.Wait()
}

type (
	// fakeFetcher is a Fetcher that serves canned results from an in-memory
	// table keyed by URL.
	fakeFetcher map[string]*fakeResult

	// fakeResult is the canned response for a single URL.
	fakeResult struct {
		body string   // page body returned by Fetch
		urls []string // links returned by Fetch
	}
)

// IsProcessed reports whether url has been seen before and, in the same
// locked step, records it in urlCache so that later calls return true.
func (f fakeFetcher) IsProcessed(url string) bool {
	mux.Lock()
	defer mux.Unlock()

	seen := urlCache[url]
	urlCache[url] = true
	return seen
}

// Fetch looks url up in the canned table and returns its body and links.
// A URL with no entry yields an empty body, nil links and a "not found"
// error.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	page, found := f[url]
	if !found {
		return "", nil, fmt.Errorf("not found: %s", url)
	}
	return page.body, page.urls, nil
}

// fetcher is the canned table of pages used by main: each key is a URL and
// each value holds that page's body and the links found on it.
var fetcher = fakeFetcher{
	"https://golang.org/": {
		body: "The Go Programming Language",
		urls: []string{
			"https://golang.org/pkg/",
			"https://golang.org/cmd/",
		},
	},
	"https://golang.org/pkg/": {
		body: "Packages",
		urls: []string{
			"https://golang.org/",
			"https://golang.org/cmd/",
			"https://golang.org/pkg/fmt/",
			"https://golang.org/pkg/os/",
		},
	},
	"https://golang.org/pkg/fmt/": {
		body: "Package fmt",
		urls: []string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
	"https://golang.org/pkg/os/": {
		body: "Package os",
		urls: []string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
}