我有这个爬虫库,如果第一个用户代理返回错误,我想更改我的用户代理,但是这段代码不起作用,如果第一个用户代理不起作用,我已经发送了第二次尝试,但这永远不会完成,因为 onHTML 不是触发:
package scraper
import (
"net/http"
"github.com/davecgh/go-spew/spew"
"github.com/gocolly/colly"
)
const (
fbUserAgent = "ua 1"
userAgent = "ua 2"
)
type ScrapeResult struct {
Title string
Description string
SiteName string
URL string
Images []string
}
func Scrape2(url string) (*ScrapeResult, error) {
var (
res *ScrapeResult
scrapeErr error
done = make(chan bool, 1)
c = colly.NewCollector()
)
c.OnError(func(r *colly.Response, err error) {
if ua := r.Request.Headers.Get("User-Agent"); ua == fbUserAgent {
c.Request(
"GET",
url,
nil,
nil,
http.Header{
"User-Agent": []string{userAgent},
"Accept": []string{"*/*"},
},
)
} else {
scrapeErr = err
done <- true
}
})
c.OnHTML("html", func(e *colly.HTMLElement) {
spew.Dump("ON HTML")
res = &ScrapeResult{URL: url}
res.Title = FindTitle(e)
res.Description = FindDescription(e)
res.SiteName = FindSiteName(e)
res.Images = FindImages(e)
done <- true
})
c.Request(
"GET",
url,
nil,
nil,
http.Header{
"User-Agent": []string{fbUserAgent},
"Accept": []string{"*/*"}, // * / *
"Accept-Language": []string{"en-GB,en-US;q=0.9,en;q=0.8"},
"Accept-Encoding": []string{"gzip, deflate, br"},
"Connection": []string{"keep-alive"},
"sec-ch-ua": []string{` Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90`},
},
)
<- done
return res, scrapeErr
}
func FindTitle(e *colly.HTMLElement) string {
if content := e.ChildAttr(`meta[property="og:title"]`, "content"); len(content) > 0 {
return content
}
return ""
}
func FindDescription(e *colly.HTMLElement) string {
if content := e.ChildAttr(`meta[property="og:description"]`, "content"); len(content) > 0 {
return content
}
return ""
}
func FindSiteName(e *colly.HTMLElement) string {
if content := e.ChildAttr(`meta[property="og:site_name"]`, "content"); len(content) > 0 {
return content
}
return ""
}
func FindImages(e *colly.HTMLElement) []string {
images := make([]string, 0)
if content := e.ChildAttr(`meta[property="og:image"]`, "content"); len(content) > 0 {
images = append(images, content)
}
return images
}
如何第二次发出 colly 请求并触发 onHTML?谢谢你