当表这么干净时,Java 部门——虽然变得更容易处理——就不是必需的了。只需稍加处理字符串即可获得所需的东西:
library(pdftools)
library(stringi)
library(tidyverse)
# read it with pdftools
book <- pdf_text("global-wealth-databook.pdf")
# go to the page
lines <- stri_split_lines(book[[113]])[[1]]
# remove footer
lines <- discard(lines, stri_detect_fixed, "Credit Suisse")
# find line before start of table
start <- last(which(stri_detect_regex(lines, "^[[:space:]]+")))+1
# find line after table
end <- last(which(lines == ""))-1
# smuch into something read.[table|csv] can read
tab <- paste0(stri_replace_all_regex(lines[start:end], "[[:space:]][[:space:]]+", "\t"), collapse="\n")
#read it
read.csv(text=tab, header=FALSE, sep="\t", stringsAsFactors = FALSE)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10
## 1 Egypt 56,036 3,168 324 98.1 1.7 0.2 0.0 100.0 91.7
## 2 El Salvador 3,957 14,443 6,906 66.0 32.8 1.2 0.0 100.0 65.7
## 3 Equatorial Guinea 670 8,044 2,616 87.0 12.2 0.7 0.1 100.0 77.3
## 4 Eritrea 2,401 3,607 2,036 94.5 5.4 0.1 100.0 57.1 NA
## 5 Estonia 1,040 43,158 27,522 22.5 72.2 5.1 0.2 100.0 56.4
## 6 Ethiopia 49,168 153 103 100.0 0.0 100.0 43.4 NA NA
## 7 Fiji 568 6,309 3,059 85.0 14.6 0.4 0.0 100.0 68.2
## 8 Finland 4,312 159,098 57,850 30.8 33.8 33.5 1.9 100.0 76.7
## 9 France 49,239 263,399 119,720 25.3 21.4 49.3 4.0 100.0 70.2
## 10 Gabon 1,098 15,168 7,367 62.0 36.5 1.5 0.0 100.0 68.4
## 11 Gambia 904 898 347 99.2 0.7 0.0 100.0 72.4 NA
## 12 Georgia 2,950 19,430 9,874 50.7 47.6 1.6 0.1 100.0 66.8
## 13 Germany 67,244 203,946 47,091 29.5 33.7 33.9 2.9 100.0 79.1
## 14 Ghana 14,574 809 411 99.5 0.5 0.0 100.0 66.1 NA
## 15 Greece 9,020 111,684 54,665 20.7 52.9 25.4 1.0 100.0 67.7
## 16 Grenada 70 17,523 4,625 74.0 24.3 1.5 0.2 100.0 81.5
## 17 Guinea 5,896 814 374 99.4 0.6 0.0 100.0 69.7 NA
## 18 Guinea-Bissau 884 477 243 99.8 0.2 100.0 65.6 NA NA
## 19 Guyana 467 5,345 2,510 89.0 10.7 0.3 0.0 100.0 67.2
## 20 Haiti 6,172 2,879 894 96.2 3.6 0.2 0.0 100.0 76.9
## 21 Hong Kong 6,172 193,248 46,079 26.3 50.9 20.9 1.9 100.0 85.1
## 22 Hungary 7,846 39,813 30,111 11.8 83.4 4.8 0.0 100.0 45.3
## 23 Iceland 245 587,649 444,999 13.0 72.0 15.0 100.0 46.7 NA
## 24 India 834,608 5,976 1,295 92.3 7.2 0.5 0.0 100.0 83.0
## 25 Indonesia 167,559 11,001 1,914 81.9 17.0 1.1 0.1 100.0 83.7
## 26 Iran 56,306 3,831 1,856 94.1 5.7 0.2 0.0 100.0 67.3
## 27 Ireland 3,434 248,466 84,592 31.2 22.7 42.3 3.6 100.0 81.3
## 28 Israel 5,315 198,406 78,244 22.3 38.7 36.7 2.3 100.0 74.2
## 29 Italy 48,544 223,572 124,636 21.3 22.0 54.1 2.7 100.0 66.0
## 30 Jamaica 1,962 9,485 3,717 79.0 20.2 0.8 0.0 100.0 74.3
## 31 Japan 105,228 225,057 123,724 7.9 35.7 53.9 2.6 100.0 60.9
## 32 Jordan 5,212 13,099 6,014 65.7 33.1 1.2 0.0 100.0 76.1
## 33 Kazakhstan 12,011 4,441 334 97.6 2.1 0.3 0.0 100.0 92.6
## 34 Kenya 23,732 1,809 662 97.4 2.5 0.1 0.0 100.0 77.2
## 35 Korea 41,007 160,609 67,934 20.0 40.5 37.8 1.7 100.0 70.0
## 36 Kuwait 2,996 97,304 37,788 30.3 48.3 20.4 1.0 100.0 76.9
## 37 Kyrgyzstan 3,611 4,689 2,472 92.7 7.0 0.2 0.0 100.0 62.9
## 38 Laos 3,849 5,662 1,382 94.6 4.7 0.7 0.0 100.0 84.9
## 39 Latvia 1,577 27,631 17,828 29.0 68.6 2.2 0.1 100.0 53.6
## 40 Lebanon 4,085 24,161 6,452 69.0 28.5 2.3 0.2 100.0 82.0
## 41 Lesotho 1,184 3,163 945 95.9 3.8 0.3 0.0 100.0 79.8
## 42 Liberia 2,211 2,193 959 97.3 2.6 0.1 0.0 100.0 71.6
## 43 Libya 4,007 45,103 24,510 29.6 61.1 9.2 0.2 100.0 59.9
## 44 Lithuania 2,316 27,507 17,931 27.3 70.4 2.1 0.1 100.0 51.6
## 45 Luxembourg 450 313,687 167,664 17.0 20.0 58.8 4.2 100.0 68.1
## 46 Macedonia 1,607 9,044 5,698 77.0 22.5 0.5 0.0 100.0 56.4
更新
这更通用,但您仍然需要进行一些手动清理。我认为即使您使用 Tabula 也会如此。
library(pdftools)
library(stringi)
library(tidyverse)
# read it with pdftools
book <- pdf_text("~/Downloads/global-wealth-databook.pdf")
transcribe_page <- function(book, pg) {
# go to the page
lines <- stri_split_lines(book[[pg]])[[1]]
# remove footer
lines <- discard(lines, stri_detect_fixed, "Credit Suisse")
# find line before start of table
start <- last(which(stri_detect_regex(lines, "^[[:space:]]+")))+1
# find line after table
end <- last(which(lines == ""))-1
# get the target rows
rows <- lines[start:end]
# map out where data values are
stri_replace_first_regex(rows, "([[:alpha:]]) ([[:alpha:]])", "$1_$2") %>%
stri_replace_all_regex("[^[:blank:]]", "X") %>%
map(~rle(strsplit(.x, "")[[1]])) -> pos
# compute the number of data fields
nfields <- ceiling(max(map_int(pos, ~length(.x$lengths))) / 2)
# do our best to get them into columns
data_frame(rec = rows) %>%
separate(rec, into=sprintf("X%s", 1:nfields), sep="[[:space:]]{2,}", fill="left") %>%
print(n=length(rows))
}
transcribe_page(book, 112)
transcribe_page(book, 113)
transcribe_page(book, 114)
transcribe_page(book, 115)
看看 ^^ 的输出。它们的形状并不糟糕,一些清理可以是程序化的。