xml - 在 R 中提取 XML 节点和属性

Question

我有一个如下所示的 XML 数据集：

<protocol ID='.'>
    <HEAD></HEAD>
    <block ID='...'>
        <HEAD></HEAD>
        <trial ID='.....'>
            <HEAD></HEAD>
            <seq ID=''>
                <HEAD></HEAD>
                <calibration CLASS='affine-calibration' ID='New Calibration'>
                    <AX>.........</AX>
                    <BX>-........</BX>
                    <AY>.........</AY>
                    <BY>.........</BY>
                    <type>'por'</type>
                </calibration>
                <POR TIME='......'>
                    <PUPIL>.</PUPIL>
                    <BLINK>.</BLINK>
                    <V>...</V>
                    <H>...</H>
                    <PLANEINTRWV>...</PLANEINTRWV>
                    <PLANEINTRWH>...</PLANEINTRWH>
                    <PLANE>.</PLANE>
                </POR>
                <POR TIME='......'>
                    <PUPIL>.</PUPIL>
                    <BLINK>.</BLINK>
                    <V>...</V>
                    <H>...</H>
                    <PLANEINTRWV>...</PLANEINTRWV>
                    <PLANEINTRWH>...</PLANEINTRWH>
                    <PLANE>.</PLANE>
                </POR>
                <POR TIME='......'>
                    <PUPIL>.</PUPIL>
                    <BLINK>.</BLINK>
                    <V>...</V>
                    <H>...</H>
                    <PLANEINTRWV>...</PLANEINTRWV>
                    <PLANEINTRWH>...</PLANEINTRWH>
                    <PLANE>.</PLANE>
                </POR>
            </seq>
        </trial>
        <trial ID='.....'>
            <HEAD></HEAD>
            <seq ID=''>
                <HEAD></HEAD>
                <calibration CLASS='affine-calibration' ID='New Calibration'>
                    <AX>.........</AX>
                    <BX>-........</BX>
                    <AY>.........</AY>
                    <BY>.........</BY>
                    <type>'por'</type>
                </calibration>
                <POR TIME='......'>
                    <PUPIL>.</PUPIL>
                    <BLINK>.</BLINK>
                    <V>...</V>
                    <H>...</H>
                    <PLANEINTRWV>...</PLANEINTRWV>
                    <PLANEINTRWH>...</PLANEINTRWH>
                    <PLANE>.</PLANE>
                </POR>
                <POR TIME='......'>
                    <PUPIL>.</PUPIL>
                    <BLINK>.</BLINK>
                    <V>...</V>
                    <H>...</H>
                    <PLANEINTRWV>...</PLANEINTRWV>
                    <PLANEINTRWH>...</PLANEINTRWH>
                    <PLANE>.</PLANE>
                </POR>
            </seq>
        </trial>
    </block>
</protocol>

使用 XML 包，提取 POR 标记的子标记和标记属性的最简洁方法是什么？

我拼凑出了下面这个能用的函数，但它很慢（很可能是因为 xpathSApply 调用），而且几乎无法阅读。

# Build one data frame from an XML file: the children of every <POR> node
# (read as seven integer columns) side by side with the tag name and
# attributes of every element that carries a TIME attribute.
trackToDataFrame <- function(file) {
  parsed <- xmlParse(file)

  # Tag name followed by all attributes of a single node.
  describe_node <- function(node) c(name = xmlName(node), xmlAttrs(node))
  stamps <- t(xpathSApply(parsed, "//*[@TIME]", describe_node))

  por_nodes <- getNodeSet(parsed, "//POR")
  measurements <- xmlToDataFrame(por_nodes, colClasses = rep("integer", 7))

  cbind(measurements, stamps)
}

对数据集调用该函数会返回：

  PUPIL BLINK  V  H PLANEINTRWV PLANEINTRWH PLANE name   TIME
1    NA    NA NA NA          NA          NA    NA  POR ......
2    NA    NA NA NA          NA          NA    NA  POR ......
3    NA    NA NA NA          NA          NA    NA  POR ......
4    NA    NA NA NA          NA          NA    NA  POR ......
5    NA    NA NA NA          NA          NA    NA  POR ......

我认为整个事情可以通过一个 xmlToDataFrame 调用来完成，但我对 XML 包不够熟悉，无法让它工作。

我真正感兴趣的是 TIME 列，以及 xmlToDataFrame 调用提取出的所有列。

score 15 · Accepted Answer

require(XML)
# Variant 1: let xmlToDataFrame() convert the <POR> nodes, then append the
# constant tag name and the TIME attribute as two extra columns.
Fun1 <- function(xdata) {
  parsed <- xmlParse(xdata)
  por_nodes <- getNodeSet(parsed, "//*/POR")
  result <- xmlToDataFrame(nodes = por_nodes, stringsAsFactors = FALSE)
  time_attr <- xpathSApply(parsed, "//*/POR/@TIME")
  result$name <- "POR"
  result$TIME <- time_attr
  result
}

# Variant 2: visit every <POR> node once, collecting its child values, tag
# name and attributes in a single pass, then turn the result into a data frame.
Fun2 <- function(xdata) {
  parsed <- xmlParse(xdata)

  # Child values first, then the tag name, then the node's attributes.
  node_fields <- function(node) {
    child_values <- sapply(xmlChildren(node), xmlValue)
    c(child_values, name = xmlName(node), xmlAttrs(node))
  }

  fields <- xpathSApply(parsed, "//*/POR", node_fields)
  as.data.frame(t(fields), stringsAsFactors = FALSE)
}

> identical(Fun1(xdata), Fun2(xdata))
[1] TRUE

# Load rbenchmark, which supplies benchmark().
library(rbenchmark)

# Time both implementations on the same input.
benchmark(Fun1(xdata), Fun2(xdata))

         test replications elapsed relative user.self sys.self user.child
1 Fun1(xdata)          100   1.047    2.069     1.044        0          0
2 Fun2(xdata)          100   0.506    1.000     0.504        0          0
  sys.child
1         0
2         0

score 2 · Accepted Answer

user1609452 的修改版本：

#' Extract child values, tag name and attributes of nodes matching an XPath
#'
#' Uses xmlParse(), xpathSApply(), xmlChildren(), xmlValue(), xmlName() and
#' xmlAttrs() from the XML package, as.data.table() from data.table and
#' rbind.fill() from plyr; all three packages must be attached by the caller.
#'
#' @param xdata XML input, handed unchanged to xmlParse().
#' @param expr XPath expression selecting the nodes, e.g. "//*/Array".
#' @param transpo If TRUE (the default), the xpathSApply() result is
#'   transposed and converted directly. If FALSE, each per-node result is
#'   turned into its own data frame and the pieces are stacked with
#'   rbind.fill(); use this when the matched nodes have no children or do
#'   not all carry the same attributes.
#' @return The result of as.data.table() applied to the collected fields.
extractXML <- function(xdata, expr, transpo = TRUE) {
  # Child values first, then the tag name, then the node's attributes.
  node_fields <- function(node) {
    c(
      sapply(xmlChildren(node), xmlValue),
      name = xmlName(node),
      xmlAttrs(node)
    )
  }

  parsed <- xmlParse(xdata)
  fields <- xpathSApply(parsed, expr, node_fields)

  # TRUE/FALSE are spelled out because T and F are ordinary variables that
  # can be reassigned. The explicit comparison is kept so that callers
  # passing 1/0 still reach the same branch as before.
  if (transpo == TRUE) {
    as.data.table(t(fields), stringsAsFactors = FALSE)
  } else {
    per_node <- lapply(fields, function(y) {
      as.data.frame(y, stringsAsFactors = FALSE)
    })
    as.data.table(rbind.fill(per_node))
  }
}

这样做的目的是：当节点没有子节点、但您仍想获取其属性时，可以使用 transpo = F。

下面的例子：

<Arrays>
    <Array Factor="1.000000" CompressionRate="" CompressionType="" BitsPerPixel="16" Height="515" Width="682" Name="Exp1Cam1" Type="Image"/>
    <Array Factor="1.000000" CompressionRate="" CompressionType="" BitsPerPixel="16" Height="515" Width="682" Name="Exp1Cam2" Type="Image"/>
    <Array Factor="1.000000" CompressionRate="" CompressionType="" BitsPerPixel="16" Height="515" Width="682" Name="Exp1Cam1" Type="Image"/>
    <Array Factor="1.000000" CompressionRate="" CompressionType="" BitsPerPixel="16" Height="515" Width="682" Name="Exp1Cam2" Type="Image"/>
    <Array Factor="1.000000" CompressionRate="" CompressionType="" BitsPerPixel="16" Height="515" Width="682" Name="Exp1Cam1" Type="Image"/>
    <Array Factor="1.000000" CompressionRate="" CompressionType="" BitsPerPixel="16" Height="515" Width="682" Name="Exp1Cam2" Type="Image"/>
    <Array Factor="1.000000" CompressionRate="" CompressionType="" BitsPerPixel="16" Height="515" Width="682" Name="Exp1Cam1" Type="Image"/>
    <Array Factor="1.000000" CompressionRate="" CompressionType="" BitsPerPixel="16" Height="515" Width="682" Name="Exp1Cam2" Type="Image" Description=""/>
</Arrays>


extractXML(xdata, "//*/Array", T)

       V1     V2     V3     V4     V5     V6     V7     V8
1: <list> <list> <list> <list> <list> <list> <list> <list>

extractXML(xdata, "//*/Array", F)

    name   Factor CompressionRate CompressionType BitsPerPixel Height Width     Name  Type Description
1: Array 1.000000                                           16    515   682 Exp1Cam1 Image          NA
2: Array 1.000000                                           16    515   682 Exp1Cam2 Image          NA
3: Array 1.000000                                           16    515   682 Exp1Cam1 Image          NA
4: Array 1.000000                                           16    515   682 Exp1Cam2 Image          NA
5: Array 1.000000                                           16    515   682 Exp1Cam1 Image          NA
6: Array 1.000000                                           16    515   682 Exp1Cam2 Image          NA
7: Array 1.000000                                           16    515   682 Exp1Cam1 Image          NA
8: Array 1.000000                                           16    515   682 Exp1Cam2 Image

xml - 在 R 中提取 XML 节点和属性

2 回答 2

Related

Reference