1

我正在尝试从 R 中的数据框编写 XML 文件,但遇到了性能问题。

我有以下代码来编写 XML 文件,它适用于我示例中的小数据框。但是,我的真实数据框包含超过 50,000 行和 5 列,处理它需要 10 多个小时。

我怎样才能提高性能?

library(XML)  # library() stops with an error if XML is missing; require() only warns

# Example data: columns 1-2 identify a row, columns 3-4 hold the measures.
products <- c("A", "B", "C")
location <- c(1, 2, 3)
var1 <- c(1, 2, 3)
var2 <- c(1, 2, 3)
df <- data.frame(products, location, var1, var2)

# Root <data guid="snapshot_data"> with a nested <data> that collects the
# <element> nodes.
data <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
data2 <- newXMLNode("data", parent = data)

# One <element> per (row, measure column) pair.
# seq_len() gives zero iterations for an empty data frame, whereas
# 1:nrow(df) would iterate over c(1, 0).
for (j in seq_len(nrow(df))) {
  for (i in 3:4) {
    element <- newXMLNode(
      "element",
      attrs = c(guid = paste(colnames(df)[i], df[j, 1], df[j, 2], sep = "_")),
      parent = data2
    )
    newXMLNode("name", paste(colnames(df)[i], df[j, 1], df[j, 2], sep = " "),
               parent = element)
    value <- newXMLNode("value", attrs = c(period = "year", unit = "Pure"),
                        parent = element)
    newXMLNode("orig", round(df[j, i]), parent = value)
    newXMLNode("processed", parent = value)
    # <meta> carries three <ref> nodes: measure name, product and location.
    meta <- newXMLNode("meta", parent = element)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "fs_items"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = colnames(df)[i]), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "products"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = as.character(df[j, 1])), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "location"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = as.character(df[j, 2])), parent = ref)
  }
}

# Serialize the whole tree, starting at the root node, to disk.
saveXML(data, file = "test.xml")
4

1 回答 1

2

这里有几个问题拖慢了您的代码。首先,嵌套循环对您并没有什么帮助。您可以通过重塑数据框来摆脱它:

library(XML)
library(reshape2)

# Same example data as in the question.
products <- c("A", "B", "C")
location <- c(1, 2, 3)
var1 <- c(1, 2, 3)
var2 <- c(1, 2, 3)
df <- data.frame(products, location, var1, var2)

# Long format: one row per (products, location, variable) combination.
df2 <- melt(df, id.vars = c("products", "location"))

# Convert every factor column to character.
# List-style indexing (df2[mask]) always yields a data frame, so this also
# works when exactly one column is a factor; df2[, mask] would drop to a plain
# vector in that case and lapply() would then iterate over its elements.
is_fac <- vapply(df2, is.factor, logical(1))
df2[is_fac] <- lapply(df2[is_fac], as.character)

df2
  products location variable value
1        A        1     var1     1
2        B        2     var1     2
3        C        3     var1     3
4        A        1     var2     1
5        B        2     var2     2
6        C        3     var2     3

这样,您想要包含在 XML 中的每个指标都排列在自己的单独列中。

这将导致以下用于构建 XML 树的方法(包装在一个函数中以便稍后进行基准测试):

# Variant 2: builds the element tree with a single loop over the long-format
# global data frame `df2` (columns products, location, variable, value).
# paste() and round() are still evaluated once per row inside the loop.
# Arguments in `...` are ignored.
# Returns the inner <data> node that holds all <element> children.
xml2 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  data2 <- newXMLNode("data", parent = root)
  # seq_len() yields no iterations for an empty df2, whereas 1:nrow(df2)
  # would iterate over c(1, 0).
  for (j in seq_len(nrow(df2))) {
    element <- newXMLNode(
      "element",
      attrs = c(guid = paste(df2$variable[j], df2$products[j],
                             df2$location[j], sep = "_")),
      parent = data2
    )
    newXMLNode(
      "name",
      paste(df2$variable[j], df2$products[j], df2$location[j], sep = " "),
      parent = element
    )
    value <- newXMLNode("value", attrs = c(period = "year", unit = "Pure"),
                        parent = element)
    newXMLNode("orig", round(df2$value[j]), parent = value)
    newXMLNode("processed", parent = value)
    # <meta> carries three <ref> nodes: measure name, product and location.
    meta <- newXMLNode("meta", parent = element)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "fs_items"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = df2$variable[j]), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "products"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = df2$products[j]), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "location"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = df2$location[j]), parent = ref)
  }
  data2
}

除此之外,您不必要地调用了 as.character 函数,并且在循环中重复调用 round 和 paste 函数,而这些函数完全可以事先在数据框上一次性调用:

# Precompute, once for the whole data frame, the strings and rounded values
# that would otherwise be rebuilt on every loop iteration.
df3 <- df2
df3[["element"]] <- with(df2, paste(variable, products, location, sep = "_"))
df3[["name"]] <- with(df2, paste(variable, products, location, sep = " "))
df3[["value"]] <- round(df2[["value"]])

这导致:

# Variant 3: like the single-loop version, but reads the precomputed `element`,
# `name` and rounded `value` columns of the global data frame `df3`, so no
# paste()/round() calls happen inside the loop.
# Arguments in `...` are ignored.
# Returns the inner <data> node that holds all <element> children.
xml3 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  data2 <- newXMLNode("data", parent = root)
  # seq_len() yields no iterations for an empty df3, whereas 1:nrow(df3)
  # would iterate over c(1, 0).
  for (j in seq_len(nrow(df3))) {
    element <- newXMLNode("element", attrs = c(guid = df3$element[j]),
                          parent = data2)
    newXMLNode("name", df3$name[j], parent = element)
    value <- newXMLNode("value", attrs = c(period = "year", unit = "Pure"),
                        parent = element)
    newXMLNode("orig", df3$value[j], parent = value)
    newXMLNode("processed", parent = value)
    # <meta> carries three <ref> nodes: measure name, product and location.
    meta <- newXMLNode("meta", parent = element)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "fs_items"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = df3$variable[j]), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "products"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = df3$products[j]), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "location"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = df3$location[j]), parent = ref)
  }
  data2
}

最后,您可以在调用 newXMLNode 时直接创建子节点:

# Variant 4: reads the precomputed global data frame `df3` and attaches child
# nodes through the `.children` argument of newXMLNode() instead of creating
# each child with a separate `parent =` call. <meta> is still added to
# <element> in a second call.
# Arguments in `...` are ignored.
# Returns the inner <data> node that holds all <element> children.
xml4 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  data2 <- newXMLNode("data", parent = root)
  # seq_len() yields no iterations for an empty df3, whereas 1:nrow(df3)
  # would iterate over c(1, 0).
  for (j in seq_len(nrow(df3))) {
    element <- newXMLNode(
      "element", attrs = c(guid = df3$element[j]), parent = data2,
      .children = list(
        newXMLNode("name", df3$name[j]),
        newXMLNode(
          "value", attrs = c(period = "year", unit = "Pure"),
          .children = list(
            newXMLNode("orig", df3$value[j]),
            newXMLNode("processed")
          )
        )
      )
    )
    # <meta> with its three <ref>/<value> pairs, appended to <element>.
    newXMLNode(
      "meta", parent = element,
      .children = list(
        newXMLNode(
          "ref", attrs = c("source-guid" = "fs_items"),
          .children = newXMLNode("value", attrs = c(guid = df3$variable[j]))
        ),
        newXMLNode(
          "ref", attrs = c("source-guid" = "products"),
          .children = newXMLNode("value", attrs = c(guid = df3$products[j]))
        ),
        newXMLNode(
          "ref", attrs = c("source-guid" = "location"),
          .children = newXMLNode("value", attrs = c(guid = df3$location[j]))
        )
      )
    )
  }
  data2
}

因此,如果我们采用您的原始流程:

# Variant 1 (baseline): the nested-loop approach from the question, wrapped in
# a function so it can be benchmarked. Reads the global data frame `df`;
# columns 1 and 2 are used as identifiers and columns 3 and 4 as measures.
# paste(), round() and as.character() are evaluated inside the inner loop.
# Arguments in `...` are ignored.
# Returns the inner <data> node that holds all <element> children.
xml1 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  data2 <- newXMLNode("data", parent = root)
  # seq_len() yields no iterations for an empty df, whereas 1:nrow(df)
  # would iterate over c(1, 0).
  for (j in seq_len(nrow(df))) {
    for (i in 3:4) {
      element <- newXMLNode(
        "element",
        attrs = c(guid = paste(colnames(df)[i], df[j, 1], df[j, 2], sep = "_")),
        parent = data2
      )
      newXMLNode("name", paste(colnames(df)[i], df[j, 1], df[j, 2], sep = " "),
                 parent = element)
      value <- newXMLNode("value", attrs = c(period = "year", unit = "Pure"),
                          parent = element)
      newXMLNode("orig", round(df[j, i]), parent = value)
      newXMLNode("processed", parent = value)
      # <meta> carries three <ref> nodes: measure name, product and location.
      meta <- newXMLNode("meta", parent = element)
      ref <- newXMLNode("ref", attrs = c("source-guid" = "fs_items"),
                        parent = meta)
      newXMLNode("value", attrs = c(guid = colnames(df)[i]), parent = ref)
      ref <- newXMLNode("ref", attrs = c("source-guid" = "products"),
                        parent = meta)
      newXMLNode("value", attrs = c(guid = as.character(df[j, 1])),
                 parent = ref)
      ref <- newXMLNode("ref", attrs = c("source-guid" = "location"),
                        parent = meta)
      newXMLNode("value", attrs = c(guid = as.character(df[j, 2])),
                 parent = ref)
    }
  }
  data2
}

并对其进行基准测试:

# Time the four variants. The call is namespace-qualified because the
# microbenchmark package is not attached anywhere in these snippets.
microbenchmark::microbenchmark(xml1(), xml2(), xml3(), xml4())
Unit: milliseconds
   expr       min        lq    median        uq      max neval
 xml1() 100.43712 100.97356 101.52694 102.28243 367.6518   100
 xml2()  99.38772 100.02676 100.63210 101.19588 373.8043   100
 xml3()  98.91923  99.67163 100.22482 100.92313 394.2360   100
 xml4()  82.09688  82.60983  83.02559  83.64807 367.6711   100

重塑数据框,以及在数据框上一次性调用函数(而不是在 for 循环中重复调用),都只有(非常)小的帮助;真正节省时间的做法是在调用 newXMLNode 时就把子节点分配给父节点。它仍然不会很快,但应该比您现在的做法更快。

编辑

如果您需要更快的速度,可以再进一步合并节点的创建(在第一次调用 newXMLNode 时就把 "meta" 节点作为 "element" 节点的子节点):

# Variant 5: reads the precomputed global data frame `df3` and builds each
# <element>, including its <meta> subtree, in a single nested newXMLNode()
# call via `.children`.
# Arguments in `...` are ignored.
# Returns the inner <data> node that holds all <element> children.
xml5 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  data2 <- newXMLNode("data", parent = root)
  # seq_len() yields no iterations for an empty df3, whereas 1:nrow(df3)
  # would iterate over c(1, 0).
  for (j in seq_len(nrow(df3))) {
    newXMLNode(
      "element", attrs = c(guid = df3$element[j]), parent = data2,
      .children = list(
        newXMLNode("name", df3$name[j]),
        newXMLNode(
          "value", attrs = c(period = "year", unit = "Pure"),
          .children = list(
            newXMLNode("orig", df3$value[j]),
            newXMLNode("processed")
          )
        ),
        newXMLNode(
          "meta",
          .children = list(
            newXMLNode(
              "ref", attrs = c("source-guid" = "fs_items"),
              .children = newXMLNode("value",
                                     attrs = c(guid = df3$variable[j]))
            ),
            newXMLNode(
              "ref", attrs = c("source-guid" = "products"),
              .children = newXMLNode("value",
                                     attrs = c(guid = df3$products[j]))
            ),
            newXMLNode(
              "ref", attrs = c("source-guid" = "location"),
              .children = newXMLNode("value",
                                     attrs = c(guid = df3$location[j]))
            )
          )
        )
      )
    )
  }
  data2
}

然而,除此之外,您可能还需要重新评估构建 XML 文档本身的方式。例如,如果您把原先放在 "value" 子节点中的值直接作为 "ref" 节点的节点值(目前 "ref" 节点只有属性而没有节点值),则循环的每次迭代可以少调用三次 newXMLNode:

# Variant 6: same single-call construction as the fully nested variant, but
# with a simplified document layout: each <ref> stores its value as node text
# instead of wrapping it in a child <value guid="..."> node, which saves three
# newXMLNode() calls per row. Reads the precomputed global data frame `df3`.
# Arguments in `...` are ignored.
# Returns the inner <data> node that holds all <element> children.
xml6 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  data2 <- newXMLNode("data", parent = root)
  # seq_len() yields no iterations for an empty df3, whereas 1:nrow(df3)
  # would iterate over c(1, 0).
  for (j in seq_len(nrow(df3))) {
    newXMLNode(
      "element", attrs = c(guid = df3$element[j]), parent = data2,
      .children = list(
        newXMLNode("name", df3$name[j]),
        newXMLNode(
          "value", attrs = c(period = "year", unit = "Pure"),
          .children = list(
            newXMLNode("orig", df3$value[j]),
            newXMLNode("processed")
          )
        ),
        newXMLNode(
          "meta",
          .children = list(
            newXMLNode("ref", df3$variable[j],
                       attrs = c("source-guid" = "fs_items")),
            newXMLNode("ref", df3$products[j],
                       attrs = c("source-guid" = "products")),
            newXMLNode("ref", df3$location[j],
                       attrs = c("source-guid" = "location"))
          )
        )
      )
    )
  }
  data2
}

简化 XML 文档的结构可以提高速度:

# Time all six variants. The call is namespace-qualified because the
# microbenchmark package is not attached anywhere in these snippets.
microbenchmark::microbenchmark(xml1(), xml2(), xml3(), xml4(), xml5(), xml6())

Unit: milliseconds
   expr      min        lq    median        uq      max neval
 xml1() 99.66528 100.79417 101.09906 101.56140 393.4303   100
 xml2() 98.58393  99.68279  99.90569 100.64327 392.6561   100
 xml3() 98.26595  99.41217  99.65450 100.37495 363.4646   100
 xml4() 81.32157  82.33324  82.62350  82.96958 363.4569   100
 xml5() 78.89286  79.96670  80.14763  80.74278 346.1388   100
 xml6() 71.17018  72.05212  72.36548  72.81261 334.9638   100

这仍然不会将运行时间从几小时缩短到几分钟。如果你真的需要快速运行的东西,我会选择 R 以外的东西,它可以更快地处理循环。

于 2013-06-11T18:16:21.440 回答