
I am trying to batch-download the CORINE Landcover zip files from this site, but I think the interrelated feedback-survey window from Google Analytics is causing trouble here. Is there some kind of curl handler for such issues?

I looked at the cookies in SQLite Manager; these are the entries: baseDomain = "europa.eu", name = "clc06_c133.zip", value = "sectors%3Denvironment", host = "www.eea.europa.eu", … The cookie path is "C:\Users\Kay\AppData\Roaming\Mozilla\Firefox\Profiles\ckut8fjm.default\cookies.sqlite".
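In case cookies really are the blocker: libcurl can be handed a cookie file. A minimal, untested sketch with RCurl, assuming the cookies have first been exported from Firefox's cookies.sqlite into the plain-text Netscape format that libcurl expects (the cookies.txt name here is hypothetical):

library(RCurl)

# CURLOPT_COOKIEFILE: read cookies from a Netscape-format cookie jar.
# Firefox's cookies.sqlite is an SQLite database, so it would need
# exporting to cookies.txt first.
page <- getURL("http://www.eea.europa.eu/data-and-maps/data/clc-2006-vector-data-version-2",
               cookiefile = "cookies.txt")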

setwd("D:/GIS DataBase/CorineLC/")

mylist <- list(
clc06_1 <- "111 - Continuous urban fabric",
clc06_2 <- "112 - Discontinuous urban fabric",
clc06_3 <- "121 - Industrial or commercial units",
clc06_4 <- "122 - Road and rail networks and associated land",
clc06_5 <- "123 - Port areas",
clc06_6 <- "124 - Airports",
clc06_7 <- "131 - Mineral extraction sites",
clc06_8 <- "132 - Dump sites",
clc06_9 <- "133 - Construction sites",
clc06_10 <- "141 - Green urban areas",
clc06_11 <- "142 - Sport and leisure facilities",
clc06_12 <- "211 - Non-irrigated arable land",
clc06_13 <- "212 - Permanently irrigated land",
clc06_14 <- "213 - Rice fields",
clc06_15 <- "221 - Vineyards",
clc06_16 <- "222 - Fruit trees and berry plantations",
clc06_17 <- "223 - Olive groves",
clc06_18 <- "231 - Pastures",
clc06_19 <- "241 - Annual crops associated with permanent crops",
clc06_20 <- "242 - Complex cultivation patterns",
clc06_21 <- "243 - Land principally occupied by agriculture, with significant areas of natural vegetation",
clc06_22 <- "244 - Agro-forestry areas",
clc06_23 <- "311 - Broad-leaved forest",
clc06_24 <- "312 - Coniferous forest",
clc06_25 <- "313 - Mixed forest",
clc06_26 <- "321 - Natural grasslands",
clc06_27 <- "322 - Moors and heathland",
clc06_28 <- "323 - Sclerophyllous vegetation",
clc06_29 <- "324 - Transitional woodland-shrub",
clc06_30 <- "331 - Beaches, dunes, sands",
clc06_31 <- "332 - Bare rocks",
clc06_32 <- "333 - Sparsely vegetated areas",
clc06_33 <- "334 - Burnt areas",
clc06_34 <- "335 - Glaciers and perpetual snow",
clc06_35 <- "411 - Inland marshes",
clc06_36 <- "412 - Peat bogs",
clc06_37 <- "421 - Salt marshes",
clc06_38 <- "422 - Salines",
clc06_39 <- "423 - Intertidal flats",
clc06_40 <- "511 - Water courses",
clc06_41 <- "512 - Water bodies",
clc06_42 <- "521 - Coastal lagoons",
clc06_43 <- "522 - Estuaries",
clc06_44 <- "523 - Sea and ocean")

# extract the CLC codes which are the 3-digit number in the string:
foo1 <- function(x) unlist(strsplit(x, " - "))[[1]]
# and the names
foo2 <- function(x) unlist(strsplit(x, " - "))[[2]]

codes <- sapply(mylist, foo1, simplify = TRUE)
names <- sapply(mylist, foo2, simplify = TRUE)

# make urls
names_url <- gsub(",", "",  gsub("\\s", "-", names))
dl_url <- paste0("http://www.eea.europa.eu/data-and-maps/data/clc-2006-vector-data-version-2/",
                 codes, "-", names_url, "/clc06_c", codes, ".zip/at_download/file")

# get zip file names
get_zip_name <- function(x) unlist(strsplit(x, "/"))[grep(".zip", unlist(strsplit(x, "/")))]

# function to plug into sapply
dl_url_to_zip <- function(x) download.file(x, dest = get_zip_name(x))

# gives http status 404!
sapply(dl_url, dl_url_to_zip)

1 Answer


You can use the httr package:

require(httr)
require(XML)

# fetch the dataset overview page and parse the HTML
response <- GET("http://www.eea.europa.eu/data-and-maps/data/clc-2006-vector-data-version-2")
doc <- htmlParse(content(response, as = "text"))
# collect every href that points into the dataset, then drop the few
# entries that are not the 44 per-class zip download links
files <- xpathSApply(doc,'//*/a[contains(@href,"http://www.eea.europa.eu/data-and-maps/data/clc-2006-vector-data-version-2/")]/@href')
files <- files[-c(1, 47:50)]

files[10]
#href 
#"http://www.eea.europa.eu/data-and-maps/data/clc-2006-vector-data-version-2/141-green-urban-areas/clc06_c141.zip/at_download/file" 
dl_url[10]
#[1] "http://www.eea.europa.eu/data-and-maps/data/clc-2006-vector-data-version-2/141-Green-urban-areas/clc06_c141.zip/at_download/file"

Note that your URLs have the capitalized Green-urban-areas where the site uses green-urban-areas.
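So the one-line fix in your original url-building step is to lowercase the slug; that alone should resolve the 404s:

names_url <- tolower(gsub(",", "", gsub("\\s", "-", names)))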

# download one file; the response body is the raw bytes of the zip
response <- GET(files[10])
writeBin(content(response, as = "raw"), "test.zip")

I started with the httr package because I thought we might need cookies. In fact,

download.file(files[10],'test.zip',mode="wb")

works just as well. The error in the original code was the capitalization.
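Putting it together, a sketch of the whole batch download over the scraped links, reusing get_zip_name() from the question to recover file names such as clc06_c141.zip from each URL (try() is added here so one failed download does not abort the loop):

for (u in files) {
  try(download.file(u, destfile = get_zip_name(u), mode = "wb"))
}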

Answered 2013-04-20T12:22:06