我有几个 PDF 文件,希望从中提取股东表。如何只提取出现在字符串 'TWENTY LARGEST SHAREHOLDERS' 之后的表?
我尝试过下面的代码,但不太确定函数部分该怎么写。
library("pdftools")
library("tidyverse")
# Location(s) of the annual report PDF(s) to process.
url <- "https://www.computershare.com/News/Annual%20Report%202019.pdf?2"

# Read every PDF listed in `url`; the result is a list with one element
# (whatever pdf_text() returns for that document) per URL.
raw_text <- url %>%
  map(pdf_text)
#' Extract the shareholder table that follows a heading in PDF text
#'
#' Looks for the first page whose text contains `heading`, then parses the
#' first contiguous run of table rows found after the heading on that page.
#' A table row is a line made of three fields separated by two or more
#' whitespace characters: a name, a share count (digits and commas) and a
#' percentage (optionally followed by "%"). Blank lines are ignored; the
#' table ends at the first non-blank line that is not shaped like a row, or
#' at the end of the page.
#'
#' @param table Character vector of page texts for one document (one element
#'   per page), or a list of such strings.
#' @param heading Literal text that precedes the table. Matched exactly
#'   (not as a regular expression).
#' @return A data frame with columns "Name" (character), "Number of Shares"
#'   (numeric) and "Percentage" (numeric). It has zero rows when the heading
#'   or the table rows cannot be found.
clean_table <- function(table, heading = "TWENTY LARGEST SHAREHOLDERS") {
  stopifnot(
    is.character(heading),
    length(heading) == 1L,
    !is.na(heading),
    nzchar(heading)
  )

  col_names <- c("Name", "Number of Shares", "Percentage")

  # name <2+ spaces> share count <2+ spaces> percentage [%]
  row_pattern <- paste0(
    "^(.*[^[:space:]])[[:space:]]{2,}",
    "([0-9][0-9,]*)[[:space:]]{2,}",
    "([0-9]+([.][0-9]+)?)[[:space:]]*%?$"
  )

  # Single constructor so the empty and non-empty results share one schema.
  make_result <- function(name, shares, percentage) {
    out <- data.frame(name, shares, percentage, stringsAsFactors = FALSE)
    names(out) <- col_names
    out
  }

  pages <- as.character(unlist(table))
  pages_with_heading <- pages[grepl(heading, pages, fixed = TRUE)]

  # The heading can also occur on pages without the table (e.g. a table of
  # contents), so keep looking until a page actually yields rows.
  for (page in pages_with_heading) {
    page_lines <- strsplit(page, "\r?\n")[[1L]]
    heading_at <- which(grepl(heading, page_lines, fixed = TRUE))[1L]
    if (is.na(heading_at)) {
      next
    }

    # Only lines strictly after the heading are candidates for table rows.
    candidates <- trimws(page_lines[-seq_len(heading_at)])
    candidates <- candidates[nzchar(candidates)]

    is_row <- grepl(row_pattern, candidates)
    if (!any(is_row)) {
      next
    }

    # Keep the first contiguous run of row-like lines: skip column headers
    # before it and stop at the first line that no longer looks like a row.
    first_row <- which(is_row)[1L]
    stops <- which(!is_row)
    stops <- stops[stops > first_row]
    last_row <- if (length(stops) > 0L) stops[1L] - 1L else length(candidates)
    rows <- candidates[first_row:last_row]

    fields <- regmatches(rows, regexec(row_pattern, rows))
    name <- vapply(fields, `[`, character(1), 2L)
    shares <- vapply(fields, `[`, character(1), 3L)
    percentage <- vapply(fields, `[`, character(1), 4L)

    return(make_result(
      name,
      as.numeric(gsub(",", "", shares, fixed = TRUE)),
      as.numeric(percentage)
    ))
  }

  make_result(character(0), numeric(0), numeric(0))
}
# Run clean_table() on each document's text, then stack the per-document
# results row-wise into a single data frame.
per_document <- map(raw_text, clean_table)
shares <- bind_rows(per_document)