I've completed A Tour of Go and am now working through some of the Colly tutorials. I understand MaxDepth and have been trying to implement it in a Go program, like so:
package main

import (
    "encoding/json"
    "log"
    "net/http"

    "github.com/gocolly/colly"
)

func ping(w http.ResponseWriter, r *http.Request) {
    log.Println("Ping")
    w.Write([]byte("ping"))
}

func getData(w http.ResponseWriter, r *http.Request) {
    // Verify that the "url" query parameter exists
    URL := r.URL.Query().Get("url")
    if URL == "" {
        log.Println("missing URL argument")
        return
    }
    log.Println("visiting", URL)

    // Create a new collector, which is in charge of collecting the data from the HTML
    c := colly.NewCollector(
        // MaxDepth is 2, so only the links on the scraped page
        // and links on those pages are visited
        colly.MaxDepth(2),
        colly.Async(true),
    )

    // Limit the maximum parallelism to 2.
    // This is necessary if the goroutines are created dynamically,
    // to control the number of simultaneous requests.
    //
    // Parallelism can also be controlled by spawning a fixed
    // number of goroutines.
    c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 2})

    // Slice to store the data
    var response []string

    // OnHTML registers a callback that runs whenever the collector
    // finds a matching HTML element; in this case, whenever it finds
    // an anchor tag with an href attribute, the anonymous function below
    // resolves the href to an absolute URL and appends it to our slice
    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
        link := e.Request.AbsoluteURL(e.Attr("href"))
        if link != "" {
            response = append(response, link)
        }
    })

    // Visit the website
    c.Visit(URL)

    // Serialize our response slice to JSON
    b, err := json.Marshal(response)
    if err != nil {
        log.Println("failed to serialize response:", err)
        return
    }

    // Set the Content-Type header and write the body for our endpoint
    w.Header().Add("Content-Type", "application/json")
    w.Write(b)
}

func main() {
    addr := ":7171"

    http.HandleFunc("/links", getData)
    http.HandleFunc("/ping", ping)

    log.Println("listening on", addr)
    log.Fatal(http.ListenAndServe(addr, nil))
}
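I exercise the endpoint by starting the server and requesting it like this (go-colly.org is just a placeholder target):

curl 'http://localhost:7171/links?url=http://go-colly.org/'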
When I do this, the response is empty. Removing the MaxDepth and Async lines produces the expected response (only the top-level links). Any help is appreciated!
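For reference, the parallel-scraping example in the Colly docs calls c.Wait() on an async collector before reading results, so my guess is that my handler serializes the slice before any request has finished. Below is a minimal standalone sketch of that waiting pattern, not a drop-in fix for the handler: the target URL is a placeholder, and the sync.Mutex is my own addition to guard the shared slice, since async callbacks may run concurrently.

package main

import (
    "fmt"
    "log"
    "sync"

    "github.com/gocolly/colly"
)

func main() {
    var (
        mu    sync.Mutex // guards links; async callbacks can run concurrently
        links []string
    )

    c := colly.NewCollector(
        colly.MaxDepth(2),
        colly.Async(true),
    )
    c.Limit(&colly.LimitRule{DomainGlob: "*", Parallelism: 2})

    c.OnHTML("a[href]", func(e *colly.HTMLElement) {
        if link := e.Request.AbsoluteURL(e.Attr("href")); link != "" {
            mu.Lock()
            links = append(links, link)
            mu.Unlock()
        }
    })

    if err := c.Visit("http://go-colly.org/"); err != nil { // placeholder URL
        log.Fatal(err)
    }
    c.Wait() // block until all queued async requests have completed

    fmt.Println(len(links), "links collected")
}

I'm also not sure whether I need to call e.Request.Visit(link) inside the callback for MaxDepth to matter, since the collector only recurses into links it is told to visit.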