您在这里有几个问题正在减慢您的速度。首先,您的嵌套循环可能对您没有太大帮助。您可以通过重塑数据框来摆脱它:
# library() fails loudly when a package is missing; require() would only
# return FALSE and let the script break later at the first newXMLNode() call.
library(XML)
library(reshape2)

# Toy input: one row per product/location pair, one column per metric.
products <- c("A", "B", "C")
location <- c(1, 2, 3)
var1 <- c(1, 2, 3)
var2 <- c(1, 2, 3)
df <- data.frame(products, location, var1, var2)

# Long format: the metric columns are stacked into `variable` / `value`.
df2 <- melt(df, id.vars = c("products", "location"))

# Convert every factor column to character, leaving other columns untouched.
# The whole-frame form is used on purpose: `df2[, mask]` collapses to a bare
# vector when exactly one column is a factor, and lapply() would then iterate
# over the individual values instead of over columns, corrupting the column.
df2[] <- lapply(df2, function(column) {
  if (is.factor(column)) as.character(column) else column
})
df2
products location variable value
1 A 1 var1 1
2 B 2 var1 2
3 C 3 var1 3
4 A 1 var2 1
5 B 2 var2 2
6 C 3 var2 3
这样,您想要包含在 XML 中的每个指标都排列在自己的单独列中。
这将导致以下用于构建 XML 树的方法(包装在一个函数中以便稍后进行基准测试):
# Build the snapshot XML tree from the long-format global `df2`, emitting one
# <element> per row. Labels are pasted together and the value is rounded
# inside the loop on every iteration.
#
# Arguments:
#   ...  Ignored; accepted only so every variant shares the same signature.
# Returns:
#   The inner <data> node, whose parent is <data guid="snapshot_data">.
xml2 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  container <- newXMLNode("data", parent = root)
  # seq_len() instead of 1:nrow(): an empty df2 must produce no elements,
  # whereas 1:0 would run the body for j = 1 and j = 0.
  for (j in seq_len(nrow(df2))) {
    element <- newXMLNode(
      "element",
      attrs = c(
        guid = paste(df2$variable[j], df2$products[j], df2$location[j],
                     sep = "_")
      ),
      parent = container
    )
    # Children attach themselves through `parent =`, so the returned node
    # objects only need a name when something is hung below them.
    newXMLNode(
      "name",
      paste(df2$variable[j], df2$products[j], df2$location[j], sep = " "),
      parent = element
    )
    value <- newXMLNode("value", attrs = c(period = "year", unit = "Pure"),
                        parent = element)
    newXMLNode("orig", round(df2$value[j]), parent = value)
    newXMLNode("processed", parent = value)

    meta <- newXMLNode("meta", parent = element)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "fs_items"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = df2$variable[j]), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "products"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = df2$products[j]), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "location"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = df2$location[j]), parent = ref)
  }
  container
}
除此之外,您还不必要地调用了 as.character 函数,并且在循环中重复调用 round 和 paste 函数,而这些函数本可以事先在数据框上一次性调用:
# Precompute the per-row labels and the rounded value once, on whole columns,
# so the tree-building loops only have to index into df3.
df3 <- df2
df3[["element"]] <- with(df3, paste(variable, products, location, sep = "_"))
df3[["name"]] <- with(df3, paste(variable, products, location, sep = " "))
df3[["value"]] <- round(df3[["value"]])
这导致:
# Build the snapshot XML tree from the global `df3`, emitting one <element>
# per row. The guid, name and rounded value are read from precomputed df3
# columns; each node is still created with its own newXMLNode(parent = ...)
# call.
#
# Arguments:
#   ...  Ignored; accepted only so every variant shares the same signature.
# Returns:
#   The inner <data> node, whose parent is <data guid="snapshot_data">.
xml3 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  container <- newXMLNode("data", parent = root)
  # seq_len() instead of 1:nrow(): an empty df3 must produce no elements,
  # whereas 1:0 would run the body for j = 1 and j = 0.
  for (j in seq_len(nrow(df3))) {
    element <- newXMLNode("element", attrs = c(guid = df3$element[j]),
                          parent = container)
    # Children attach themselves through `parent =`, so the returned node
    # objects only need a name when something is hung below them.
    newXMLNode("name", df3$name[j], parent = element)
    value <- newXMLNode("value", attrs = c(period = "year", unit = "Pure"),
                        parent = element)
    newXMLNode("orig", df3$value[j], parent = value)
    newXMLNode("processed", parent = value)

    meta <- newXMLNode("meta", parent = element)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "fs_items"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = df3$variable[j]), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "products"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = df3$products[j]), parent = ref)
    ref <- newXMLNode("ref", attrs = c("source-guid" = "location"),
                      parent = meta)
    newXMLNode("value", attrs = c(guid = df3$location[j]), parent = ref)
  }
  container
}
最后,您可以在调用 newXMLNode 时通过 .children 参数直接创建子节点:
# Build the snapshot XML tree from the global `df3`, emitting one <element>
# per row. Child nodes are handed to their parent through `.children` in the
# same newXMLNode() call; <meta> is still attached to <element> afterwards
# via `parent =`.
#
# Arguments:
#   ...  Ignored; accepted only so every variant shares the same signature.
# Returns:
#   The inner <data> node, whose parent is <data guid="snapshot_data">.
xml4 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  container <- newXMLNode("data", parent = root)
  # seq_len() instead of 1:nrow(): an empty df3 must produce no elements,
  # whereas 1:0 would run the body for j = 1 and j = 0.
  for (j in seq_len(nrow(df3))) {
    element <- newXMLNode(
      "element",
      attrs = c(guid = df3$element[j]),
      parent = container,
      .children = list(
        newXMLNode("name", df3$name[j]),
        newXMLNode(
          "value",
          attrs = c(period = "year", unit = "Pure"),
          .children = list(
            newXMLNode("orig", df3$value[j]),
            newXMLNode("processed")
          )
        )
      )
    )
    # <meta> is appended after <name> and <value> because it is attached
    # only once <element> already exists.
    newXMLNode(
      "meta",
      parent = element,
      .children = list(
        newXMLNode(
          "ref",
          attrs = c("source-guid" = "fs_items"),
          .children = newXMLNode("value", attrs = c(guid = df3$variable[j]))
        ),
        newXMLNode(
          "ref",
          attrs = c("source-guid" = "products"),
          .children = newXMLNode("value", attrs = c(guid = df3$products[j]))
        ),
        newXMLNode(
          "ref",
          attrs = c("source-guid" = "location"),
          .children = newXMLNode("value", attrs = c(guid = df3$location[j]))
        )
      )
    )
  }
  container
}
因此,如果我们采用您的原始流程:
# Baseline for the benchmark: build the snapshot XML tree straight from the
# wide global `df` with a nested loop, one <element> per (row, metric column)
# pair. The repeated colnames()/paste()/round()/as.character() calls inside
# the loop are kept as they are, since they are what the other variants are
# measured against.
#
# Arguments:
#   ...  Ignored; accepted only so every variant shares the same signature.
# Returns:
#   The inner <data> node, whose parent is <data guid="snapshot_data">.
xml1 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  container <- newXMLNode("data", parent = root)
  # seq_len() instead of 1:nrow(): an empty df must produce no elements,
  # whereas 1:0 would run the body for j = 1 and j = 0.
  for (j in seq_len(nrow(df))) {
    # Columns 3 and 4 of df are the metric columns (var1, var2); columns 1
    # and 2 hold products and location.
    for (i in 3:4) {
      element <- newXMLNode(
        "element",
        attrs = c(guid = paste(colnames(df)[i], df[j, 1], df[j, 2], sep = "_")),
        parent = container
      )
      newXMLNode(
        "name",
        paste(colnames(df)[i], df[j, 1], df[j, 2], sep = " "),
        parent = element
      )
      value <- newXMLNode("value", attrs = c(period = "year", unit = "Pure"),
                          parent = element)
      newXMLNode("orig", round(df[j, i]), parent = value)
      newXMLNode("processed", parent = value)

      meta <- newXMLNode("meta", parent = element)
      ref <- newXMLNode("ref", attrs = c("source-guid" = "fs_items"),
                        parent = meta)
      newXMLNode("value", attrs = c(guid = colnames(df)[i]), parent = ref)
      ref <- newXMLNode("ref", attrs = c("source-guid" = "products"),
                        parent = meta)
      newXMLNode("value", attrs = c(guid = as.character(df[j, 1])),
                 parent = ref)
      ref <- newXMLNode("ref", attrs = c("source-guid" = "location"),
                        parent = meta)
      newXMLNode("value", attrs = c(guid = as.character(df[j, 2])),
                 parent = ref)
    }
  }
  container
}
并对其进行基准测试:
# Time the four variants. The call is namespace-qualified because the script
# never attaches the microbenchmark package.
microbenchmark::microbenchmark(xml1(), xml2(), xml3(), xml4())
Unit: milliseconds
expr min lq median uq max neval
xml1() 100.43712 100.97356 101.52694 102.28243 367.6518 100
xml2() 99.38772 100.02676 100.63210 101.19588 373.8043 100
xml3() 98.91923 99.67163 100.22482 100.92313 394.2360 100
xml4() 82.09688 82.60983 83.02559 83.64807 367.6711 100
重塑数据框,以及事先在数据框上一次性调用这些函数(而不是在 for 循环中重复调用),都只带来(非常)轻微的改善;真正节省时间的做法是在调用 newXMLNode 时直接把子节点分配给父节点。它仍然不会很快,但应该比您现在的做法更快。
编辑
如果您需要更快的速度,可以把节点的创建再折叠一层(在第一次调用 newXMLNode 时就把“meta”节点作为“element”节点的子节点):
# Build the snapshot XML tree from the global `df3`, emitting one <element>
# per row. The whole subtree of an element (<name>, <value>, <meta>) is
# passed through `.children` in a single outer newXMLNode() call.
#
# Arguments:
#   ...  Ignored; accepted only so every variant shares the same signature.
# Returns:
#   The inner <data> node, whose parent is <data guid="snapshot_data">.
xml5 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  container <- newXMLNode("data", parent = root)
  # seq_len() instead of 1:nrow(): an empty df3 must produce no elements,
  # whereas 1:0 would run the body for j = 1 and j = 0.
  for (j in seq_len(nrow(df3))) {
    newXMLNode(
      "element",
      attrs = c(guid = df3$element[j]),
      parent = container,
      .children = list(
        newXMLNode("name", df3$name[j]),
        newXMLNode(
          "value",
          attrs = c(period = "year", unit = "Pure"),
          .children = list(
            newXMLNode("orig", df3$value[j]),
            newXMLNode("processed")
          )
        ),
        newXMLNode(
          "meta",
          .children = list(
            newXMLNode(
              "ref",
              attrs = c("source-guid" = "fs_items"),
              .children = newXMLNode("value",
                                     attrs = c(guid = df3$variable[j]))
            ),
            newXMLNode(
              "ref",
              attrs = c("source-guid" = "products"),
              .children = newXMLNode("value",
                                     attrs = c(guid = df3$products[j]))
            ),
            newXMLNode(
              "ref",
              attrs = c("source-guid" = "location"),
              .children = newXMLNode("value",
                                     attrs = c(guid = df3$location[j]))
            )
          )
        )
      )
    )
  }
  container
}
然而,除此之外,您可能还需要重新评估您选择的构建 XML 文档本身的方式。例如,如果您在“ref”节点中包含“value”节点作为实际值(现在“ref”节点只有没有节点值的属性),则可以为循环的每次迭代消除对 newXMLNode 的三个调用:
# Build a simplified snapshot XML tree from the global `df3`, emitting one
# <element> per row. Each <ref> carries its value as node content instead of
# wrapping it in a child <value guid="..."> node, so the output document
# differs in structure from the one produced by the other variants.
#
# Arguments:
#   ...  Ignored; accepted only so every variant shares the same signature.
# Returns:
#   The inner <data> node, whose parent is <data guid="snapshot_data">.
xml6 <- function(...) {
  root <- newXMLNode("data", attrs = c(guid = "snapshot_data"))
  container <- newXMLNode("data", parent = root)
  # seq_len() instead of 1:nrow(): an empty df3 must produce no elements,
  # whereas 1:0 would run the body for j = 1 and j = 0.
  for (j in seq_len(nrow(df3))) {
    newXMLNode(
      "element",
      attrs = c(guid = df3$element[j]),
      parent = container,
      .children = list(
        newXMLNode("name", df3$name[j]),
        newXMLNode(
          "value",
          attrs = c(period = "year", unit = "Pure"),
          .children = list(
            newXMLNode("orig", df3$value[j]),
            newXMLNode("processed")
          )
        ),
        newXMLNode(
          "meta",
          .children = list(
            newXMLNode("ref", df3$variable[j],
                       attrs = c("source-guid" = "fs_items")),
            newXMLNode("ref", df3$products[j],
                       attrs = c("source-guid" = "products")),
            newXMLNode("ref", df3$location[j],
                       attrs = c("source-guid" = "location"))
          )
        )
      )
    )
  }
  container
}
简化 XML 文档的结构可以提高速度:
# Time all six variants. The call is namespace-qualified because the script
# never attaches the microbenchmark package.
microbenchmark::microbenchmark(
  xml1(), xml2(), xml3(), xml4(), xml5(), xml6()
)
Unit: milliseconds
expr min lq median uq max neval
xml1() 99.66528 100.79417 101.09906 101.56140 393.4303 100
xml2() 98.58393 99.68279 99.90569 100.64327 392.6561 100
xml3() 98.26595 99.41217 99.65450 100.37495 363.4646 100
xml4() 81.32157 82.33324 82.62350 82.96958 363.4569 100
xml5() 78.89286 79.96670 80.14763 80.74278 346.1388 100
xml6() 71.17018 72.05212 72.36548 72.81261 334.9638 100
这仍然不会将运行时间从几小时缩短到几分钟。如果你真的需要快速运行的东西,我会选择 R 以外的东西,它可以更快地处理循环。