0

我有这个爬虫库,如果第一个用户代理返回错误,我想更改我的用户代理,但是这段代码不起作用,如果第一个用户代理不起作用,我已经发送了第二次尝试,但这永远不会完成,因为 onHTML 不是触发:

package scraper

import (
    "net/http"
    "github.com/davecgh/go-spew/spew"
    "github.com/gocolly/colly"
)

const (
        fbUserAgent = "ua 1"
    userAgent = "ua 2"
)

type ScrapeResult struct {
    Title       string
    Description string
    SiteName    string
    URL         string
    Images      []string
}

func Scrape2(url string) (*ScrapeResult, error) {
    var (
        res *ScrapeResult
        scrapeErr error
        done = make(chan bool, 1)
        c = colly.NewCollector()
    )

    c.OnError(func(r *colly.Response, err error) {
        if ua := r.Request.Headers.Get("User-Agent"); ua == fbUserAgent {
            c.Request(
                "GET",
                url,
                nil,
                nil,
                http.Header{
                    "User-Agent": []string{userAgent},
                    "Accept": []string{"*/*"}, 
                },
            )
        } else {
            scrapeErr = err
            done <- true
        }
    })

    c.OnHTML("html", func(e *colly.HTMLElement) {
        spew.Dump("ON HTML")
        res = &ScrapeResult{URL: url}
        res.Title = FindTitle(e)
        res.Description = FindDescription(e)
        res.SiteName = FindSiteName(e)
        res.Images = FindImages(e)
        done <- true
    })

    c.Request(
        "GET",
        url,
        nil,
        nil,
        http.Header{
            "User-Agent": []string{fbUserAgent},
            "Accept": []string{"*/*"}, // * / *
            "Accept-Language": []string{"en-GB,en-US;q=0.9,en;q=0.8"},
            "Accept-Encoding": []string{"gzip, deflate, br"},
            "Connection": []string{"keep-alive"},
            "sec-ch-ua": []string{` Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90`},
        },
    )

    <- done
    return res, scrapeErr
}

func FindTitle(e *colly.HTMLElement) string {
    if content := e.ChildAttr(`meta[property="og:title"]`, "content"); len(content) > 0 {
        return content
    }
    return ""
}

func FindDescription(e *colly.HTMLElement) string {
    if content := e.ChildAttr(`meta[property="og:description"]`, "content"); len(content) > 0 {
        return content
    }
    return ""
}

func FindSiteName(e *colly.HTMLElement) string {
    if content := e.ChildAttr(`meta[property="og:site_name"]`, "content"); len(content) > 0 {
        return content
    }
    return ""
}

func FindImages(e *colly.HTMLElement) []string {
    images := make([]string, 0)
    if content := e.ChildAttr(`meta[property="og:image"]`, "content"); len(content) > 0 {
        images = append(images, content)
    }
    return images
}

如何第二次发出 colly 请求并触发 onHTML?谢谢你

4

1 回答 1

0

可以设置属性collector.CheckHead = true

这样做是确保您首先执行 GetHEAD 操作以检查连接问题,如果失败 - 将重试。

您将需要 gocolly 的 /v2 才能包含此功能。

https://github.com/gocolly/colly/blob/master/colly.go#L110

于 2022-01-29T07:31:32.787 回答