// I'm new here, so take this with a grain of salt, but this solution seems
// more idiomatic to me. It uses a single channel for all results, a single
// channel for all crawl requests (attempts to crawl a specific URL), and a
// wait group to track completion. The main Crawl call acts as a dispatcher
// of crawl requests to the worker goroutines (handling de-duplication along
// the way) and as a tracker of how many crawl requests are still pending.
package main
import (
"fmt"
"sync"
)
// Fetcher retrieves a page and the links it contains; it abstracts the
// network so the crawler can run against canned data in tests.
type Fetcher interface {
	// Fetch returns the body of URL and
	// a slice of URLs found on that page.
	Fetch(url string) (body string, urls []string, err error)
}
// FetchResult is one crawled page as delivered on the results channel:
// the page's URL, its body, and the fetch error (if any).
type FetchResult struct {
	url string
	body string
	err error
}
// CrawlRequest asks a worker to fetch url, descending at most depth
// further levels from it.
type CrawlRequest struct {
	url string
	depth int
}
// Crawler carries the shared state of one crawl: the fetcher to use, the
// channel results are streamed on, the channel pending requests are funneled
// through, and a wait group counting requests still in flight.
//
// The urlReservations field was removed: it was never assigned or read —
// de-duplication is done with a local map inside the top-level Crawl function.
type Crawler struct {
	depth int // maximum depth requested by the caller; recorded but not read anywhere
	fetcher Fetcher
	results chan FetchResult // fetched pages, consumed by the caller
	crawlRequests chan CrawlRequest // pending URLs, funneled through the dispatcher
	waitGroup *sync.WaitGroup // counts crawl requests not yet finished
}
// Crawl is the worker body for a single URL: it fetches the page, reports
// the outcome on the results channel, and re-queues every discovered link
// one level deeper. The deferred Done releases this request's slot in the
// wait group on every exit path.
func (c Crawler) Crawl(url string, depth int) {
	defer c.waitGroup.Done()
	if depth <= 0 {
		return
	}
	body, urls, err := c.fetcher.Fetch(url)
	c.results <- FetchResult{url, body, err}
	if len(urls) == 0 {
		return
	}
	// Reserve a wait-group slot per child before queuing, so the crawl
	// cannot be declared finished while children are still pending.
	c.waitGroup.Add(len(urls))
	for _, child := range urls {
		c.crawlRequests <- CrawlRequest{child, depth - 1}
	}
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
// Results are streamed on the returned channel, which is closed once every
// reachable page (within depth) has been fetched.
func Crawl(url string, depth int, fetcher Fetcher) (results chan FetchResult) {
	results = make(chan FetchResult)
	// Local de-duplication set; only the dispatcher goroutine touches it,
	// so no locking is needed.
	urlReservations := make(map[string]bool)
	crawler := Crawler{
		crawlRequests: make(chan CrawlRequest),
		depth:         depth,
		fetcher:       fetcher,
		results:       results,
		waitGroup:     &sync.WaitGroup{},
	}
	// Account for the initial request sent below.
	crawler.waitGroup.Add(1)
	// Dispatcher: listen for crawl requests, drop duplicates, and hand
	// fresh URLs to worker goroutines.
	go func() {
		for req := range crawler.crawlRequests {
			if urlReservations[req.url] {
				// Duplicate: release the wait-group slot that was
				// reserved for this request.
				crawler.waitGroup.Done()
				continue
			}
			urlReservations[req.url] = true
			go crawler.Crawl(req.url, req.depth)
		}
	}()
	// Wait for all requests to finish, then shut everything down.
	go func() {
		crawler.waitGroup.Wait()
		// All workers have exited, so no sender remains; closing
		// crawlRequests lets the dispatcher goroutine terminate
		// (previously it leaked, blocked in range forever).
		close(crawler.crawlRequests)
		close(results)
	}()
	// Send the first crawl request to the channel.
	crawler.crawlRequests <- CrawlRequest{url, depth}
	return
}
// main crawls the fake site to depth 4 and prints each result as it
// arrives; the loop ends when Crawl closes the results channel.
func main() {
	for res := range Crawl("https://golang.org/", 4, fetcher) {
		if res.err != nil {
			fmt.Println(res.err)
		} else {
			fmt.Printf("found: %s %q\n", res.url, res.body)
		}
	}
	fmt.Printf("done!")
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult

// fakeResult is one canned page: its body text and outgoing links.
type fakeResult struct {
	body string
	urls []string
}

// Fetch looks url up in the canned map; unknown URLs yield a
// "not found" error.
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
	res, ok := f[url]
	if !ok {
		return "", nil, fmt.Errorf("not found: %s", url)
	}
	return res.body, res.urls, nil
}
// fetcher is a populated fakeFetcher.
// The pages link back to one another, so a crawl over this data exercises
// both the depth limit and URL de-duplication.
var fetcher = fakeFetcher{
	"https://golang.org/": &fakeResult{
		"The Go Programming Language",
		[]string{
			"https://golang.org/pkg/",
			"https://golang.org/cmd/",
		},
	},
	"https://golang.org/pkg/": &fakeResult{
		"Packages",
		[]string{
			"https://golang.org/",
			"https://golang.org/cmd/",
			"https://golang.org/pkg/fmt/",
			"https://golang.org/pkg/os/",
		},
	},
	"https://golang.org/pkg/fmt/": &fakeResult{
		"Package fmt",
		[]string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
	"https://golang.org/pkg/os/": &fakeResult{
		"Package os",
		[]string{
			"https://golang.org/",
			"https://golang.org/pkg/",
		},
	},
}