0

我正在尝试使用 Colly 在 Go 中编写一个简单的网络爬虫。该程序应该先访问 Yahoo Finance 上特定日期范围的收益日历(earnings calendar),然后继续向外爬取,逐个访问列表中显示的每只股票的行情(Stock Ticker)页面。接着,爬虫应该在每个单独的股票行情页面上收集特定信息。

我试图获取的三条信息是各个股票行情页面上 3 个 SVG 标签的 style 属性。这些 SVG 分别表示股票的短期、中期和长期前景(我根据应用于 SVG 的 CSS 样式来解读每个“前景”的含义)。

当我运行程序时,Colly 只能抓取到大约 30% 的股票行情页面上 SVG 标签的样式,其余的都返回为空。而且每次运行程序时,成功获取到 SVG 样式的股票列表都不一样。为什么会这样?请参阅下面的源代码和示例输出:

package main

import (
    "fmt"
    "github.com/gocolly/colly/v2"
    "log"
    "strings"
    "sync"
    "time"
)

// Stock holds what the scraper records for a single ticker: identifying
// details taken from the earnings-calendar row, plus the raw "style"
// attribute of each of the three performance-outlook SVG icons found on the
// ticker's quote page.
type Stock struct {
	// Url is the href of the ticker link as it appears in the calendar row.
	Url string
	// TickerSymbol is the text of the calendar row's "Symbol" cell.
	TickerSymbol string
	// CompanyName is the text of the calendar row's "Company" cell.
	CompanyName string
	// PerformanceOutlookShort is the style attribute of the first outlook SVG.
	PerformanceOutlookShort string
	// PerformanceOutlookMid is the style attribute of the second outlook SVG.
	PerformanceOutlookMid string
	// PerformanceOutlookLong is the style attribute of the third outlook SVG.
	PerformanceOutlookLong string
}

// mu guards the stockData map built in main. The collector is asynchronous,
// so its callbacks may run on several goroutines at once and every access to
// the map (and to the Stock values it points at) must be serialized.
var mu sync.Mutex

// tickerFromHeading extracts the ticker symbol from a quote-page heading of
// the form "Company Name (TICKER)". It reports false when the heading has no
// non-empty parenthesised part, instead of panicking on a bad slice index.
func tickerFromHeading(heading string) (string, bool) {
	open := strings.LastIndex(heading, "(")
	if open < 0 {
		return "", false
	}
	// end is relative to open; -1 means no ")", 1 means an empty "()".
	end := strings.Index(heading[open:], ")")
	if end <= 1 {
		return "", false
	}
	return heading[open+1 : open+end], true
}

// main crawls one earnings-calendar page, follows the link of every listed
// ticker, records the style attribute of the three performance-outlook SVG
// icons on each quote page, and finally prints the collected values.
func main() {
	stockData := make(map[string]*Stock)

	c := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"),
		colly.AllowedDomains("finance.yahoo.com"),
		colly.MaxBodySize(0),
		colly.AllowURLRevisit(),
		colly.Async(true),
	)

	// Cap parallelism and add a fixed delay between requests to a domain.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 2,
		Delay:       500 * time.Millisecond,
	}); err != nil {
		log.Fatalln("setting limit rule:", err)
	}

	log.Println("User Agent: ", c.UserAgent)

	// Before making a request print "Visiting ...".
	c.OnRequest(func(r *colly.Request) {
		log.Println("Visiting", r.URL.String())
	})

	// Surface failed requests; without this a throttled or rejected quote
	// page looks exactly like a page that simply has no outlook icons.
	c.OnError(func(r *colly.Response, err error) {
		log.Println("Request failed:", r.Request.URL, "status:", r.StatusCode, "error:", err)
	})

	// Earnings-calendar rows: register each listed stock in the map and
	// queue a visit to its quote page.
	c.OnHTML(`.simpTblRow`, func(e *colly.HTMLElement) {
		stock := &Stock{
			CompanyName:  e.ChildText(`td[aria-label="Company"]`),
			TickerSymbol: e.ChildText(`td[aria-label="Symbol"]`),
			Url:          e.ChildAttr("td>a", "href"),
		}
		// A row without a symbol or link cannot be keyed or followed.
		if stock.TickerSymbol == "" || stock.Url == "" {
			return
		}

		mu.Lock()
		stockData[stock.TickerSymbol] = stock
		mu.Unlock()

		if err := e.Request.Visit("https://finance.yahoo.com" + stock.Url); err != nil {
			log.Println("Queueing", stock.Url, "failed:", err)
		}
	})

	// Quote pages: collect the outlook styles and store them on the matching
	// map entry.
	c.OnHTML(".finance.US", func(e *colly.HTMLElement) {
		// Skip this callback if we are on the earnings calendar page.
		if !strings.Contains(e.Request.URL.Path, "/quote/") {
			return
		}

		heading := e.ChildText("#quote-header-info h1")
		ticker, ok := tickerFromHeading(heading)
		if !ok {
			log.Printf("No ticker found in heading %q at %s", heading, e.Request.URL)
			return
		}

		// Read everything from the document before taking the lock so the
		// critical section stays short.
		// NOTE(review): these come back empty whenever the served HTML does
		// not contain the outlook widget (for example if it is rendered
		// client-side or the response is a throttled/consent variant);
		// confirm against the raw response body.
		short := e.ChildAttr(`#chrt-evts-mod > div:nth-child(3) > ul > li:first-child > a svg`, "style")
		mid := e.ChildAttr(`#chrt-evts-mod > div:nth-child(3) > ul > li:nth-child(2)> a svg`, "style")
		long := e.ChildAttr(`#chrt-evts-mod > div:nth-child(3) > ul > li:nth-child(3)> a svg`, "style")

		mu.Lock()
		defer mu.Unlock()

		currStock, known := stockData[ticker]
		if !known {
			log.Printf("Ticker %q from %s is not in the earnings list", ticker, e.Request.URL)
			return
		}
		log.Println(currStock.TickerSymbol)

		currStock.PerformanceOutlookShort = short // short-term outlook
		currStock.PerformanceOutlookMid = mid     // mid-term outlook
		currStock.PerformanceOutlookLong = long   // long-term outlook
	})

	if err := c.Visit("https://finance.yahoo.com/calendar/earnings?from=2021-02-28&to=2021-03-06&day=2021-03-04"); err != nil {
		log.Fatalln("visiting earnings calendar:", err)
	}
	// Wait returns once all queued requests have finished, so the map can
	// be read below without holding mu.
	c.Wait()

	for _, v := range stockData {
		fmt.Println(v.TickerSymbol, " - ", v.PerformanceOutlookShort)
		fmt.Println(v.TickerSymbol, " - ", v.PerformanceOutlookMid)
		fmt.Println(v.TickerSymbol, " - ", v.PerformanceOutlookLong)
	}
}

Sample Output:

FENC  -  
FENC  -  
FENC  -  
GMRE  -  
GMRE  -  
GMRE  -  
SBOW  -  
SBOW  -  
SBOW  -  
NMM  -  
NMM  -  
NMM  -  
WB  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
WB  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
WB  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
VSEC  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
VSEC  -  fill:#464e56;stroke:#464e56;stroke-width:0;vertical-align:bottom;
VSEC  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
GEENQ  -  
GEENQ  -  
GEENQ  -  
TZOO  -  
TZOO  -  
TZOO  -  
FCRD  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
FCRD  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
FCRD  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
SYNC  -  
SYNC  -  
SYNC  -  
RNET  -  
RNET  -  
RNET  -  
INFI  -  
INFI  -  
INFI  -  
GCP  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
GCP  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
GCP  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
FRPH  -  
FRPH  -  
FRPH  -  
SIGA  -  
SIGA  -  
SIGA  -  
NWPX  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
NWPX  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
NWPX  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
MICT  -  
MICT  -  
MICT  -  
SLNO  -  
SLNO  -  
SLNO  -  
AAMC  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
AAMC  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
AAMC  -  fill:#464e56;stroke:#464e56;stroke-width:0;vertical-align:bottom;
PKDC  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
PKDC  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
PKDC  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
NWLI  -  
NWLI  -  
NWLI  -  
MNDO  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
MNDO  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
MNDO  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
FTSI  -  
FTSI  -  
FTSI  -  
AIKI  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
AIKI  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
AIKI  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
CULP  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
CULP  -  fill:#464e56;stroke:#464e56;stroke-width:0;vertical-align:bottom;
CULP  -  fill:#464e56;stroke:#464e56;stroke-width:0;vertical-align:bottom;
HURC  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
HURC  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
HURC  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
EQ  -  
EQ  -  
EQ  -  
HRTG  -  
HRTG  -  
HRTG  -  
SALM  -  
SALM  -  
SALM  -  
OXFD  -  
OXFD  -  
OXFD  -  
CMTL  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
CMTL  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
CMTL  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
DXYN  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
DXYN  -  fill:#464e56;stroke:#464e56;stroke-width:0;vertical-align:bottom;
DXYN  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
SD  -  
SD  -  
SD  -  
BXMX  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
BXMX  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
BXMX  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
BXC  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
BXC  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
BXC  -  fill:#464e56;stroke:#464e56;stroke-width:0;vertical-align:bottom;
SHIP  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
SHIP  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
SHIP  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
NRP  -  
NRP  -  
NRP  -  
GWRS  -  
GWRS  -  
GWRS  -  
WPG  -  
WPG  -  
WPG  -  
MLR  -  
MLR  -  
MLR  -  
OXSQ  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
OXSQ  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
OXSQ  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
UTF  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
UTF  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
UTF  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
SMHI  -  
SMHI  -  
SMHI  -  
MNKKQ  -  
MNKKQ  -  
MNKKQ  -  
FIZZ  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
FIZZ  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
FIZZ  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
RLH  -  fill:#464e56;stroke:#464e56;stroke-width:0;vertical-align:bottom;
RLH  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
RLH  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
YY  -  
YY  -  
YY  -  
ANFIF  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
ANFIF  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
ANFIF  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
CIA  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
CIA  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
CIA  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
SFE  -  
SFE  -  
SFE  -  
APT  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
APT  -  fill:#464e56;stroke:#464e56;stroke-width:0;vertical-align:bottom;
APT  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
AIV  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
AIV  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
AIV  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
FLMN  -  fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
FLMN  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
FLMN  -  fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
4

0 回答 0