我调整了 scrapy_stocks 函数以适应 Yahoo 页面更新。我还没有彻底审查过这个解决方案,但到目前为止,它似乎在我所有的试验中都运行良好。请注意两点:
- 如果您订阅了 Yahoo Premium,我认为这段代码无法正常工作。我没有订阅,所以无法测试。不过即使如此,相应地修改代码应该也不会太难。
- 我对 rvest 经验不多,而且受页面结构所限,函数只能这样处理:只要某一行缺少一个值,该行的所有数值都会被记为缺失(NA)。
尝试这个:
#' Parse the flattened cell text of one Yahoo Finance statement table.
#'
#' @param cell_text Character vector of span texts in document order. The
#'   first `n_header` entries are the column headings; the remaining entries
#'   are row labels interleaved with their values.
#' @param is_label Logical vector, same length as `cell_text`, `TRUE` where
#'   the span is a row label.
#' @param n_header Number of heading cells (label column plus value columns).
#' @return A data frame with one row per label and `n_header - 1` numeric
#'   columns. A row stays `NA` unless a full run of `n_header - 1` value
#'   cells follows its label.
.parse_yahoo_statement <- function(cell_text, is_label, n_header) {
  n_values <- n_header - 1L
  header_idx <- seq_len(n_header)
  body_text <- cell_text[-header_idx]
  body_label <- is_label[-header_idx]

  if (length(cell_text) <= n_header || !any(body_label)) {
    stop(
      "no statement rows found; the page layout may have changed",
      call. = FALSE
    )
  }

  out <- matrix(
    NA_real_,
    nrow = sum(body_label),
    ncol = n_values,
    dimnames = list(body_text[body_label], cell_text[header_idx][-1L])
  )

  # Row that each cell belongs to: the number of labels seen so far. Cells
  # that precede the first label get 0 and are ignored, so a stray leading
  # span can no longer shift every value into the wrong row.
  row_of <- cumsum(body_label)
  last_start <- length(body_text) - n_values + 1L

  for (start in seq_len(max(last_start, 0L))) {
    window <- start:(start + n_values - 1L)
    # Only a complete run of value cells is accepted; a row with a missing
    # cell never produces such a run and therefore stays NA.
    if (row_of[start] > 0L && !any(body_label[window])) {
      out[row_of[start], ] <- as.numeric(body_text[window])
    }
  }

  as.data.frame(out)
}

#' Download and parse one financial statement page for one ticker.
#'
#' @param symbol Ticker symbol as a string.
#' @param page Page name in the Yahoo URL (e.g. "financials").
#' @param xpath XPath selecting every span of the statement table.
#' @param n_header Number of heading cells expected on that page.
#' @param open_session Function that opens a browsing session for a URL.
#' @return The data frame produced by `.parse_yahoo_statement()`.
.fetch_yahoo_statement <- function(symbol, page, xpath, n_header,
                                   open_session) {
  url <- paste0(
    "https://finance.yahoo.com/quote/", symbol, "/", page, "?p=", symbol
  )
  nodes <- rvest::html_nodes(open_session(url), xpath = xpath)
  # Strip thousands separators so the values can be coerced to numeric.
  cell_text <- gsub(",", "", xml2::xml_text(nodes), fixed = TRUE)
  # `%in%` never yields NA, so spans without a class attribute become FALSE.
  is_label <- rvest::html_attr(nodes, name = "class") %in% "Va(m)"
  .parse_yahoo_statement(cell_text, is_label, n_header)
}

#' Scrape income statement, balance sheet and cash flow from Yahoo Finance.
#'
#' For every ticker in `stock`, the three statements are downloaded and a
#' list `list(IS = , BS = , CF = )` of data frames is assigned to a variable
#' named `<ticker>.f` in the caller's environment. A ticker is assigned only
#' if all three statements were scraped; otherwise the error is reported via
#' `message()` and the loop moves on to the next ticker.
#'
#' The packages rvest and xml2 are installed if missing and then attached.
#'
#' @param stock Character vector of ticker symbols.
#' @return `NULL`, invisibly. The function is called for its side effects.
scrapy_stocks2 <- function(stock) {
  # Nothing to scrape: return before touching (or installing) any package.
  if (length(stock) == 0L) {
    return(invisible(NULL))
  }

  # Captured up front so results land in the caller's environment no matter
  # how deeply nested the assigning code is.
  target_env <- parent.frame()

  for (pkg in c("rvest", "xml2")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      install.packages(pkg)
    }
    library(pkg, character.only = TRUE)
  }

  # rvest 1.0.0 renamed html_session() to session(); use whichever exists.
  open_session <- if ("session" %in% getNamespaceExports("rvest")) {
    rvest::session
  } else {
    rvest::html_session
  }

  balance_xpath <-
    '//*[@id="Col1-1-Financials-Proxy"]/section/div[4]/div[1]/div[1]//span'

  for (stocknum in seq_along(stock)) {
    symbol <- as.character(stock[[stocknum]])
    tryCatch(
      {
        temp1 <- .fetch_yahoo_statement(
          symbol,
          page = "financials",
          xpath = '//*[@id="Col1-1-Financials-Proxy"]/section/div[4]//span',
          n_header = 6L,
          open_session = open_session
        )
        print(paste(symbol, ' Income Statement Success'))

        temp2 <- .fetch_yahoo_statement(
          symbol,
          page = "balance-sheet",
          xpath = balance_xpath,
          n_header = 5L,
          open_session = open_session
        )
        print(paste(symbol, ' Balance Sheet Success'))

        temp3 <- .fetch_yahoo_statement(
          symbol,
          page = "cash-flow",
          xpath = balance_xpath,
          n_header = 6L,
          open_session = open_session
        )
        print(paste(symbol, ' Cash Flow Statement Success'))

        assign(
          paste0(symbol, '.f'),
          value = list(IS = temp1, BS = temp2, CF = temp3),
          envir = target_env
        )
      },
      error = function(cond) {
        # conditionMessage() gives the readable text only; pasting the
        # condition object itself would also dump its deparsed call.
        message(symbol, " gave error: ", conditionMessage(cond))
      }
    )
  }

  invisible(NULL)
}