我在这里解析了您作为示例提供的文件。我首先把该文件中的数据复制到一个文本文件 copied.txt 中；
该文件需要位于当前工作目录下。下面的代码应该能让您了解如何着手处理。
library(tidyverse)
# Parse the table embedded in "copied.txt" (read from the current working
# directory) into `df`, a six-column character tibble with one row per table
# line.
df <- read_file("copied.txt") %>%
  # Keep only the table body: the text right after the first "Managers Sole"
  # match, up to the line break that precedes the closing </TABLE> tag.
  (function(x) {
    tbl_beg <- str_locate(x, "Managers Sole")[2] + 1
    tbl_end <- str_locate(x, "\r\n</TABLE>")[1]
    # str_locate() yields NA when a marker is absent; fail loudly here instead
    # of letting NA flow through the rest of the pipeline.
    if (is.na(tbl_beg) || is.na(tbl_end)) {
      stop("Table markers not found in copied.txt", call. = FALSE)
    }
    str_sub(x, tbl_beg, tbl_end)
  }) %>%
  # Drop the unwanted characters at the beginning and end of the extracted
  # string.
  str_sub(start = 4, end = -3) %>%
  # Split into individual table lines.
  str_split('\"\r\n\"') %>%
  unlist() %>%
  # Remove a broken line break left inside a line.
  str_remove("\r\n") %>%
  # Turn the multi-word trailing text into one underscore-joined token, because
  # the rows are split into columns on single spaces below.
  str_replace_all("Sole Managers Sole", " Managers_Sole") %>%
  # Collapse runs of whitespace so that one space separates the fields.
  str_squish() %>%
  # Reverse every line so the split runs from the right: the company name may
  # itself contain spaces, so it must end up as the last (unsplit) piece.
  stringi::stri_reverse() %>%
  str_split(pattern = " ", n = 6, simplify = TRUE) %>%
  # Restore the original character order of every cell. Assigning through
  # `m[]` keeps the matrix shape even when there is only one row (apply()
  # would collapse that case to a plain vector). Explicit V1..Vn column names
  # avoid the unnamed-matrix deprecation warning from as_tibble().
  (function(m) {
    m[] <- stringi::stri_reverse(m)
    colnames(m) <- paste0("V", seq_len(ncol(m)))
    m
  }) %>%
  as_tibble() %>%
  # The pieces were produced right-to-left; put them back in reading order.
  select(6:1) %>%
  set_names(
    nm = c(
      "name_of_issuer", "title_of_cl", "cusip_number",
      "fair_market_value", "shares", "shares_of_princip_mngrs"
    )
  )
# A tibble: 47 x 6
name_of_issuer title_of_cl cusip_number fair_market_value shares shares_of_princip_mngrs
<chr> <chr> <chr> <chr> <chr> <chr>
1 America Online COM 02364J104 2,940,000 20,000 Managers_Sole
2 Anheuser Busch COM 35229103 3,045,000 40,000 Managers_Sole
3 At Home COM 45919107 787,500 5,000 Managers_Sole
4 AT&T COM 1957109 5,985,937 75,000 Managers_Sole
5 Bank Toyko COM 65379109 700,000 50,000 Managers_Sole
6 Bay View Capital COM 07262L101 14,958,437 792,500 Managers_Sole
7 Broadcast.com COM 111310108 2,954,687 25,000 Managers_Sole
8 Chase Manhattan COM 16161A108 10,578,750 130,000 Managers_Sole
9 Chase Manhattan 4/85C 16161A9DQ 59,375 500 Managers_Sole
10 Cisco Systems COM 17275R102 4,930,312 45,000 Managers_Sole