我正在尝试使用 Colly 在 Go 中编写一个简单的网络爬虫。该程序应该访问 Yahoo Finance 上特定日期范围的收益日历(earnings calendar),然后继续向外爬取,访问列表中显示的每只股票(Stock Ticker)的页面。之后,爬虫应该从每个单独的股票页面上收集特定信息。
我想获取的三条信息,是各个股票行情页面上 3 个 SVG 标签的 style 属性。这些 SVG 分别表示股票的短期、中期和长期前景(我根据应用于 SVG 的 CSS 样式来解读每个“前景”的含义)。
当我运行程序时,Colly 只能抓取到大约 30% 的股票行情页面上 SVG 标签的样式,其余的返回结果都是空的。而且每次运行程序时,能够成功获取 SVG 样式的股票列表都不相同。为什么会这样?请参阅下面的源代码和示例输出:
package main
import (
"fmt"
"github.com/gocolly/colly/v2"
"log"
"strings"
"sync"
"time"
)
// Stock is the record the scraper keeps for a single ticker. Every field is a
// plain string copied from the scraped HTML; nothing is parsed or validated.
type Stock struct {
	// Identity fields, read from one row of the earnings calendar table:
	// the link's href, the "Symbol" cell text and the "Company" cell text.
	Url, TickerSymbol, CompanyName string

	// Outlook fields, each holding the raw "style" attribute of one SVG on
	// the ticker's quote page (short-, mid- and long-term, in that order).
	// They stay empty when the SVG was not found in the fetched HTML.
	PerformanceOutlookShort, PerformanceOutlookMid, PerformanceOutlookLong string
}
// mu guards the stockData map built in main. The collector is created with
// colly.Async(true), so OnHTML callbacks run on several goroutines at once,
// and Go maps are not safe for concurrent reads and writes.
var mu sync.Mutex

// tickerFromHeading extracts the ticker symbol from a quote-page heading of
// the form "Company Name (TICKER)". It uses the last pair of parentheses so
// that company names which themselves contain parentheses still work. The
// second result is false when no well-formed "(...)" suffix is present.
func tickerFromHeading(heading string) (string, bool) {
	start := strings.LastIndex(heading, "(")
	end := strings.LastIndex(heading, ")")
	if start < 0 || end <= start {
		return "", false
	}
	return heading[start+1 : end], true
}

// main crawls the Yahoo Finance earnings calendar for a fixed date range,
// follows the link of every listed ticker, records the style attribute of the
// three "performance outlook" SVGs on each quote page, and prints the result.
func main() {
	stockData := make(map[string]*Stock)

	c := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 11_2_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"),
		colly.AllowedDomains("finance.yahoo.com"),
		colly.MaxBodySize(0),
		colly.AllowURLRevisit(),
		colly.Async(true),
	)

	// Cap parallelism and insert a fixed delay between requests.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 2,
		Delay:       500 * time.Millisecond,
	}); err != nil {
		log.Fatalf("setting limit rule: %v", err)
	}
	log.Println("User Agent: ", c.UserAgent)

	// Before making a request print "Visiting ...".
	c.OnRequest(func(r *colly.Request) {
		log.Println("Visiting", r.URL.String())
	})

	// Surface failed requests; without this a failed fetch is
	// indistinguishable from a page that simply lacked the outlook SVGs.
	c.OnError(func(r *colly.Response, err error) {
		log.Printf("request to %s failed: %v", r.Request.URL, err)
	})

	// Earnings calendar page: register each listed stock in the map and
	// queue a visit to its quote page.
	c.OnHTML(`.simpTblRow`, func(e *colly.HTMLElement) {
		stock := &Stock{
			CompanyName:  e.ChildText(`td[aria-label="Company"]`),
			TickerSymbol: e.ChildText(`td[aria-label="Symbol"]`),
			Url:          e.ChildAttr("td>a", "href"),
		}
		if stock.TickerSymbol == "" || stock.Url == "" {
			log.Printf("skipping calendar row with missing symbol or link (company %q)", stock.CompanyName)
			return
		}

		// Store the entry before queuing the visit so the quote-page
		// callback can always find it.
		mu.Lock()
		stockData[stock.TickerSymbol] = stock
		mu.Unlock()

		if err := e.Request.Visit("https://finance.yahoo.com" + stock.Url); err != nil {
			log.Printf("queuing %s: %v", stock.Url, err)
		}
	})

	// Quote page: read the three outlook SVG styles and update the map entry.
	c.OnHTML(".finance.US", func(e *colly.HTMLElement) {
		// Skip this callback if we are on the earnings calendar page.
		if !strings.Contains(e.Request.URL.Path, "/quote/") {
			return
		}

		heading := e.ChildText("#quote-header-info h1")
		ticker, ok := tickerFromHeading(heading)
		if !ok {
			log.Printf("no ticker in heading %q on %s", heading, e.Request.URL)
			return
		}

		// Do the DOM queries before taking the lock to keep the critical
		// section short.
		short := e.ChildAttr(`#chrt-evts-mod > div:nth-child(3) > ul > li:first-child > a svg`, "style")
		mid := e.ChildAttr(`#chrt-evts-mod > div:nth-child(3) > ul > li:nth-child(2)> a svg`, "style")
		long := e.ChildAttr(`#chrt-evts-mod > div:nth-child(3) > ul > li:nth-child(3)> a svg`, "style")
		if short == "" && mid == "" && long == "" {
			// NOTE(review): Colly only sees the HTML the server returned and
			// runs no JavaScript; if this section is filled in client-side or
			// served inconsistently it will be missing here. Confirm by
			// inspecting the raw response body for an affected ticker.
			log.Printf("%s: outlook SVGs not present in fetched HTML", ticker)
		}

		mu.Lock()
		defer mu.Unlock()
		stock, found := stockData[ticker]
		if !found {
			log.Printf("ticker %q from %s was not listed on the calendar page", ticker, e.Request.URL)
			return
		}
		log.Println(stock.TickerSymbol)
		stock.PerformanceOutlookShort = short // short-term outlook
		stock.PerformanceOutlookMid = mid     // mid-term outlook
		stock.PerformanceOutlookLong = long   // long-term outlook
	})

	if err := c.Visit("https://finance.yahoo.com/calendar/earnings?from=2021-02-28&to=2021-03-06&day=2021-03-04"); err != nil {
		log.Printf("visiting earnings calendar: %v", err)
	}
	// Block until every queued asynchronous request has finished; after this
	// no callback is running, so the map can be read without the lock.
	c.Wait()

	for _, v := range stockData {
		fmt.Println(v.TickerSymbol, " - ", v.PerformanceOutlookShort)
		fmt.Println(v.TickerSymbol, " - ", v.PerformanceOutlookMid)
		fmt.Println(v.TickerSymbol, " - ", v.PerformanceOutlookLong)
	}
}
Sample Output:
FENC -
FENC -
FENC -
GMRE -
GMRE -
GMRE -
SBOW -
SBOW -
SBOW -
NMM -
NMM -
NMM -
WB - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
WB - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
WB - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
VSEC - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
VSEC - fill:#464e56;stroke:#464e56;stroke-width:0;vertical-align:bottom;
VSEC - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
GEENQ -
GEENQ -
GEENQ -
TZOO -
TZOO -
TZOO -
FCRD - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
FCRD - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
FCRD - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
SYNC -
SYNC -
SYNC -
RNET -
RNET -
RNET -
INFI -
INFI -
INFI -
GCP - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
GCP - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
GCP - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
FRPH -
FRPH -
FRPH -
SIGA -
SIGA -
SIGA -
NWPX - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
NWPX - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
NWPX - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
MICT -
MICT -
MICT -
SLNO -
SLNO -
SLNO -
AAMC - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
AAMC - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
AAMC - fill:#464e56;stroke:#464e56;stroke-width:0;vertical-align:bottom;
PKDC - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
PKDC - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
PKDC - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
NWLI -
NWLI -
NWLI -
MNDO - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
MNDO - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
MNDO - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
FTSI -
FTSI -
FTSI -
AIKI - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
AIKI - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
AIKI - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
CULP - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
CULP - fill:#464e56;stroke:#464e56;stroke-width:0;vertical-align:bottom;
CULP - fill:#464e56;stroke:#464e56;stroke-width:0;vertical-align:bottom;
HURC - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
HURC - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
HURC - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
EQ -
EQ -
EQ -
HRTG -
HRTG -
HRTG -
SALM -
SALM -
SALM -
OXFD -
OXFD -
OXFD -
CMTL - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
CMTL - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
CMTL - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
DXYN - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
DXYN - fill:#464e56;stroke:#464e56;stroke-width:0;vertical-align:bottom;
DXYN - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
SD -
SD -
SD -
BXMX - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
BXMX - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
BXMX - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
BXC - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
BXC - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
BXC - fill:#464e56;stroke:#464e56;stroke-width:0;vertical-align:bottom;
SHIP - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
SHIP - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
SHIP - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
NRP -
NRP -
NRP -
GWRS -
GWRS -
GWRS -
WPG -
WPG -
WPG -
MLR -
MLR -
MLR -
OXSQ - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
OXSQ - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
OXSQ - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
UTF - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
UTF - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
UTF - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
SMHI -
SMHI -
SMHI -
MNKKQ -
MNKKQ -
MNKKQ -
FIZZ - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
FIZZ - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
FIZZ - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
RLH - fill:#464e56;stroke:#464e56;stroke-width:0;vertical-align:bottom;
RLH - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
RLH - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
YY -
YY -
YY -
ANFIF - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
ANFIF - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
ANFIF - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
CIA - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
CIA - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
CIA - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
SFE -
SFE -
SFE -
APT - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
APT - fill:#464e56;stroke:#464e56;stroke-width:0;vertical-align:bottom;
APT - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
AIV - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
AIV - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
AIV - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
FLMN - fill:#ff4d52;stroke:#ff4d52;stroke-width:0;vertical-align:bottom;
FLMN - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;
FLMN - fill:#1ac567;stroke:#1ac567;stroke-width:0;vertical-align:bottom;