7

我经常使用来自 EUROSTAT 的数据,发现无法将数据直接加载到 R 中非常烦人。我编写了这个片段,用来获取 EUROSTAT 批量下载工具(http://epp.eurostat.ec.europa.eu/NavTree_prod/everybody/BulkDownloadListing?sort=1&dir=dic%2Fen)提供的任何数据集。

有没有更好的办法?下面这段代码对我有用:

    #this library is used to download data from eurostat and to find datasets
#later extend to find datasets with certain dimensions

#download data from eurostat
#unpack and convert to dataframe
#load label descriptions
#load factors
#save as r data object

# Code of the Eurostat dataset to fetch; it is used to build the download URL
# and the name of the saved .RData file.
datasetname <- "ebd_all"

# Language code used to build the URLs of the label dictionaries.
LANGUAGE <- "en"

# Install RCurl only when it is missing, so that re-running the script does not
# reinstall the package (and hit the network) every time.
if (!requireNamespace("RCurl", quietly = TRUE)) {
  install.packages("RCurl")
}
library(RCurl)
library(data.table)
library(reshape)
library(stringr)

# Bulk-download endpoint; the dataset file name is appended to it.
baseurl <- "http://epp.eurostat.ec.europa.eu/NavTree_prod/everybody/BulkDownloadListing?sort=1&file=data%2F"

# Download the gzipped TSV to a temporary file and read it via a gz connection.
fullfilename <- paste0(datasetname, ".tsv.gz")
temp <- tempfile(fileext = ".gz")
download.file(paste0(baseurl, fullfilename), temp)
dataconnection <- gzfile(temp)
d <- read.delim(dataconnection)

# The first column is the id column; every other column is melted into
# (time, value) pairs.
keycolumn <- colnames(d)[1]
longdata <- melt(d, id = keycolumn)
colnames(longdata) <- c("dimensions", "time", "value")

# The id column name is the dot-separated list of dimension names followed by
# ".time": strip that suffix, then split and upper-case the dimension names.
firstname <- substr(keycolumn, 1, nchar(keycolumn) - nchar(".time"))
headings <- toupper(strsplit(firstname, ".", fixed = TRUE)[[1]])
headingcount <- length(headings)


# Build the dimension table: split the comma-separated key of every row into
# one column per entry of `headings`.
dimensionkeys <- as.character(longdata[, "dimensions"])
dimensionparts <- colsplit(dimensionkeys, split = "\\,", names = headings)
dimensions <- data.table(dimensionparts)

# Download the dimension dictionary (dimlst.dic) for the chosen language.
dimfile <- paste0(
  "http://epp.eurostat.ec.europa.eu/NavTree_prod/everybody/BulkDownloadListing?sort=1&file=dic%2F",
  LANGUAGE,
  "%2Fdimlst.dic"
)

temp <- tempfile(fileext = ".gz")
download.file(dimfile, temp)
dataconnection <- gzfile(temp)
dimdata <- read.delim(dataconnection, header = FALSE)
colnames(dimdata) <- c("colname", "desc")

# Lookup vector: descriptions named by their dimension code.
lab <- setNames(dimdata$desc, dimdata$colname)

# Self-explanatory column titles, in the same order as `headings`.
speakingheadings <- as.character(lab[headings])

# Download the code list of every dimension and attach its description column
# (named "<HEADING>_desc") to the dimension table.
for (heading in headings) {
  factorfile <- paste0(
    "http://epp.eurostat.ec.europa.eu/NavTree_prod/everybody/BulkDownloadListing?sort=1&file=dic%2F",
    LANGUAGE, "%2F", tolower(heading), ".dic"
  )
  temp <- tempfile(fileext = ".gz")
  download.file(factorfile, temp)
  dataconnection <- gzfile(temp)
  factordata <- read.delim(dataconnection, header = FALSE)
  colnames(factordata) <- c(heading, paste0(heading, "_desc"))
  # sort = FALSE is essential: by default merge() sorts the result by the join
  # column, which would break the row-by-row correspondence with `longdata`
  # that the rest of the script relies on.
  dimensions <- merge(dimensions, factordata,
                      by = heading, all.x = TRUE, sort = FALSE)
}

# merge() moves its `by` column to the front on every iteration; put the
# dimension columns back in `headings` order, followed by the *_desc columns.
setcolorder(dimensions, c(headings, setdiff(colnames(dimensions), headings)))


  #at the end add speaking headings
# Rename the dimension columns by name rather than by position: merge()
# reorders columns, so the first length(headings) columns are not guaranteed
# to be the dimension columns in `headings` order. When no label was found for
# a dimension, keep its code as the column name instead of passing NA.
columnlabels <- ifelse(is.na(speakingheadings), headings, speakingheadings)
setnames(dimensions, headings, columnlabels)

  #add data columns: split each raw cell into a value and a flag
# A raw cell is either "<token>" or "<token> <flag>", where the token is a
# number or ":" for a missing observation. Parse it with vectorised string
# operations so that the flag is detected from the presence of a separator,
# not by comparing the value column with a recycled copy of itself.
rawvalues <- trimws(as.character(longdata$value))
hasflag <- grepl("[[:space:]]", rawvalues)

# Text before the first whitespace is the numeric token.
numberpart <- sub("[[:space:]].*$", "", rawvalues)
# ":" (missing) and empty cells become NA explicitly, so as.double() only
# warns about genuinely unexpected content.
numberpart[numberpart %in% c(":", "")] <- NA_character_

# Text after the first run of whitespace is the flag; NA when there is none.
flagpart <- ifelse(
  hasflag,
  sub("^[^[:space:]]*[[:space:]]+", "", rawvalues),
  NA_character_
)

values <- data.table(value = as.double(numberpart), flag = flagpart)

# Assemble the final table. cbind() pairs rows purely by position, so
# `dimensions`, `longdata` and `values` must all be in the same row order.
eurostatdata <- cbind(dimensions, time = longdata$time, values)

# paste0() avoids the space that paste() would insert before ".RData"
# (the file is written as "<datasetname>.RData", not "<datasetname> .RData").
save(eurostatdata, file = paste0(datasetname, ".RData"))
4

3 回答 3

7

Check out SmarterPoland package, there are functions to download (and get into R) data directly from EUROSTAT.

here is example:

library(SmarterPoland)

# Search the Eurostat table of contents for passenger-transport tables.
search_phrase <- "split of passenger transport"
grepEurostatTOC(search_phrase)

# Download one table by its code and print a per-column summary.
table_code <- "tsdtr210"
tmp <- getEurostatRCV(table_code)
summary(tmp)

##     vehicle         geo            time          value     
##  BUS_TOT:756   AT     :  63   1990   : 108   Min.   : 0.0  
##  CAR    :756   BE     :  63   1991   : 108   1st Qu.: 6.9  
##  TRN    :756   BG     :  63   1992   : 108   Median :12.9  
##                CH     :  63   1993   : 108   Mean   :33.6  
##                CY     :  63   1994   : 108   3rd Qu.:77.4  
##                CZ     :  63   1995   : 108   Max.   :93.4  
##                (Other):1890   (Other):1620   NA's   :397

source: www.smarterpoland.pl

于 2012-11-19T13:57:25.600 回答
6

欧盟统计局以 SDMX 格式传播其统计数据。在 R 中,您可以使用rsdmx包来读取它们的数据。请参见下面的示例:

# In case you want to install rsdmx from GitHub
# (otherwise you can install it from CRAN).
# library() is used instead of require() so a missing package fails loudly.
library(devtools)
# install_github() takes a single "user/repo" specification; a second
# positional argument is not the user name.
install_github("opensdmx/rsdmx")
library(rsdmx)

# Read a EUROSTAT dataset.
# The query parameters are spelled startPeriod / endPeriod, and the URL must
# not end with a stray space.
dataURL <- "http://ec.europa.eu/eurostat/SDMX/diss-web/rest/data/cdh_e_fos/..PC.FOS1.BE/?startPeriod=2005&endPeriod=2011"
sdmx <- readSDMX(dataURL)
stats <- as.data.frame(sdmx)
head(stats)

注意:您可以从CRAN找到 rsdmx,也可以直接从 GitHub 存储库安装它。https://github.com/opensdmx/rsdmx

rsdmx 不仅可以读取 SDMX 数据集(datasets),还可以读取所有 SDMX 文档(数据或元数据),包括 data structures、codelists 等。如果您需要更多示例,我邀请您查看 rsdmx wiki。

于 2014-10-26T13:24:16.233 回答
2

修订答案

RJSDMX也可用于将数据从 Eurostat 检索到 R 中。示例如下所示。

library(RJSDMX)

# Provider name and series identifier passed to getTimeSeries().
provider <- "EUROSTAT"
series_id <- "nama_gdp_c/.EUR_HAB.B1GM.DE"
data <- getTimeSeries(provider, series_id)

初步答案(留在这里供参考)

更好的方法可能是使用Eurostat Web Service 工具,该工具提供对 Eurostat 数据的编程访问。Web Service 工具是 Eurostat 提供的另一项服务——除了批量下载工具——用于从其数据库中提取数据。要使用该服务,您需要发送 REST 或 SOAP 请求。从服务器检索到的数据是一种分层数据结构——一种 XML 文档,可以使用 XML 包对其进行解析。

在查看下面的快速示例之前,我建议阅读一些有关 Eurostat 提供的服务的信息,可以在这里找到:http://epp.eurostat.ec.europa.eu/portal/page/portal/sdmx_web_services/getting_started/a_few_useful_points

# Step 0: Load the XML package.
# This is used later on to parse the XML retrieved from Eurostat.
# For a tutorial on XML and parsing XML documents, read this: http://www.w3schools.com/xpath/default.asp

library(XML)

# Step 1: Build the REST query.
# Background reading: http://epp.eurostat.ec.europa.eu/portal/page/portal/sdmx_web_services/getting_started/a_few_useful_points

# Root of the REST service; the resource path is appended to it.
base_url <- "http://ec.europa.eu/eurostat/SDMX/diss-web/rest/"

# What to retrieve: resource type, dataflow, series key and time filter.
resource <- "data"
dataflow <- "nama_gdp_c"
key <- ".EUR_HAB.B1GM.DE"
time_filter <- "?startPeriod=2010"

# Join the path segments with "/", then append the query string.
resource_path <- paste(c(resource, dataflow, key), collapse = "/")
partial_url <- paste0(resource_path, time_filter)
rest_query <- paste0(base_url, partial_url)

# Step 2: Make the request using cURL (that is, retrieve the data)
# For information about cURL, read this: http://curl.haxx.se/
# For information about the curl command, check out the man pages: http://curl.haxx.se/docs/manpage.html

# The URL is shell-quoted: it contains "?" and, as soon as a second query
# parameter is added, "&" - both of which the shell would otherwise interpret.
command   <- paste("curl", shQuote(rest_query))
# intern = TRUE captures the command's output as a character vector.
raw_data  <- system(command, intern = TRUE)

# Note: at this stage, the data is a character object.

class(raw_data)

# The raw response is reproduced at the bottom of this script; note that it is
# a hierarchical (XML) structure.

# Step 3: Parse the response with the XML package.
data <- xmlParse(raw_data)

# The parsed object has classes "XMLInternalDocument" and "XMLAbstractDocument".
class(data)

# Step 4: Extract the numerical data.
obs_value_path <- "//generic:ObsValue"

# getNodeSet() finds the observation nodes, but returns them as whole nodes.
getNodeSet(data, obs_value_path)

# The numbers are stored in each node's "value" attribute; xpathApply() with
# xmlGetAttr pulls them out as a list, which is printed here.
obs_values <- xpathApply(data, obs_value_path, xmlGetAttr, name = "value")
obs_values

# Convert that list into a numeric vector.
numbers <- as.numeric(obs_values)

# Step 5: Extract the dates (years) the same way, from the "value" attribute
# of the ObsDimension nodes.
obs_dimension_path <- "//generic:ObsDimension"
obs_periods <- xpathApply(data, obs_dimension_path, xmlGetAttr, name = "value")
years <- as.numeric(obs_periods)

# Step 6 and on-wards:
#
# Enter your code here... =)
#
#
#

作为参考,原始数据如下所示:

<?xml version="1.0" encoding="utf-8"?>
<message:GenericData xmlns:footer="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message/footer" xmlns:generic="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic" xmlns:common="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common" xmlns:message="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
  <message:Header>
    <message:ID>13e08b8d24936d75b3a6fa1b9c632e22</message:ID>
    <message:Test>false</message:Test>
    <message:Prepared>2014-10-20T21:10:50</message:Prepared>
    <message:Sender id="ESTAT">
      <common:Name xml:lang="en">Eurostat</common:Name>
      <message:Timezone>+01:00</message:Timezone>
    </message:Sender>
    <message:Receiver id="RECEIVER"/>
    <message:Structure structureID="ESTAT_DSD_nama_gdp_c_1_0" dimensionAtObservation="TIME_PERIOD">
      <common:Structure>
        <Ref agencyID="ESTAT" id="DSD_nama_gdp_c" version="1.0"/>
      </common:Structure>
    </message:Structure>
    <message:DataSetAction>Append</message:DataSetAction>
    <message:DataSetID>nama_gdp_c</message:DataSetID>
  </message:Header>
  <message:DataSet structureRef="ESTAT_DSD_nama_gdp_c_1_0">
    <generic:Series>
      <generic:SeriesKey>
        <generic:Value id="UNIT" value="EUR_HAB"/>
        <generic:Value id="INDIC_NA" value="B1GM"/>
        <generic:Value id="GEO" value="DE"/>
        <generic:Value id="FREQ" value="A"/>
      </generic:SeriesKey>
      <generic:Obs>
        <generic:ObsDimension value="2013"/>
        <generic:ObsValue value="33300.0"/>
      </generic:Obs>
      <generic:Obs>
        <generic:ObsDimension value="2012"/>
        <generic:ObsValue value="32600.0"/>
      </generic:Obs>
      <generic:Obs>
        <generic:ObsDimension value="2011"/>
        <generic:ObsValue value="31900.0"/>
      </generic:Obs>
      <generic:Obs>
        <generic:ObsDimension value="2010"/>
        <generic:ObsValue value="30500.0"/>
      </generic:Obs>
    </generic:Series>
  </message:DataSet>
</message:GenericData>

注意事项:我使用 cURL 发送请求,但这也可以通过许多其他方式完成,例如使用 Wget、Perl、PHP 等。只要您愿意(并且能够)使用 system() 命令,以编程方式将 Eurostat 的数据导入 R 应该足够简单(参见下面的编辑)。将数据转换为 ts 对象(或 mts 对象,具体取决于您发送的查询)也应该足够简单。最后,我使用的是 Linux 操作系统(Ubuntu 发行版),所以如果您使用的是 Windows,上面的示例可能不适合您。

我希望这有帮助!

编辑:我刚刚注意到你加载了 RCurl 包,所以如果你愿意,可以用该包提供的工具来替换我使用的 system() 命令。

于 2014-10-20T22:47:03.067 回答