0

我想得到两个长度相同的向量:一个包含子节点的属性,另一个包含对应父节点的属性。示例文件:

# Sample document: one <country> root holding two <city> elements, each with
# three <place> children; every <city> and <place> carries an `id` attribute.
countries.xml <- "<country>
              <city id='1'>
                <place id='1.1'> xxx </place>
                <place id='1.2'> xxx </place>
                <place id='1.3'> xxx </place>
              </city>
              <city id='2'>
                <place id='2.1'> xxx </place>
                <place id='2.2'> xxx </place>
                <place id='2.3'> xxx </place>
              </city>
           </country>"

到目前为止我的代码

library("XML")
# Parse the XML text into an internal document so XPath queries can run on it.
doc <- xmlTreeParse(countries.xml, useInternalNodes = TRUE)
# `id` attribute of every <place> that sits directly under a <city>.
xpathSApply(doc, "//city/place/@id")
# `id` attribute of the parents of those places. The query selects each parent
# node once, which is why only one id per city comes back.
xpathSApply(doc, "//city/place/parent::*/@id")

我希望最终得到这样的向量(命名)

"1.1" "1.2" "1.3" "2.1" "2.2" "2.3"
"1" "1" "1" "2" "2" "2"

但相反,第二条路径产生

"1" "2" 

我可以通过循环得到我想要的

library(glue)
# Ids of all places, with the attribute names stripped.
place_id <- unname(xpathSApply(doc, "//city/place/@id"))
# Accumulator for the parent ids; one XPath query is issued per place id.
city_id <- vector()
for (i in place_id) {
  # Look up the parent of the place whose id equals `i` and append its id.
  city_id <- c(
    city_id,
    unname(xpathSApply(doc, path = glue("//city/place[@id={i}]/parent::*/@id")))
  )
}
city_id
"1" "1" "1" "2" "2" "2"

但它效率非常低,对于我正在处理的大型 XML 文件需要很长时间。我确信只要给 xpathSApply 传入正确的路径,就能得到我需要的结果,但我没有找到这样的路径,所以请有人指点一下 :)?

UPDATE @Wietze314 解决方案在我的简单示例中效果很好,但我无法将其调整为更复杂的 xml 文件。我确实设法更改了他的代码以处理以下示例

# Three-level sample: <continent> root -> <country> -> <city> -> <place>.
# Every attribute value must be quoted on both sides, otherwise the document
# is not well-formed XML and cannot be parsed.
countries.xml <- "<continent>
          <country id='c1'>
          <city id='1'>
            <place id='1.1'> xxx </place>
            <place id='1.2'> xxx </place>
            <place id='1.3'> xxx </place>
          </city>
          <city id='2'>
            <place id='2.1'> xxx </place>
            <place id='2.2'> xxx </place>
            <place id='2.3'> xxx </place>
          </city>
       </country>
       <country id='c2'>
          <city id='1'>
            <place id='1.1'> xxx </place>
            <place id='1.2'> xxx </place>
            <place id='1.3'> xxx </place>
          </city>
          <city id='2'>
            <place id='2.1'> xxx </place>
            <place id='2.2'> xxx </place>
            <place id='2.3'> xxx </place>
          </city>
       </country>
        </continent>"

这段代码

    # Pair every city id with the ids of its places and stack the results.
    # The two lists are matched by position, so both must list the cities in
    # the same order.
    pmap_df(list(
  # First list: ids of the cities under each country, flattened into one list
  # with a one-row tibble (column `city`) per city.
  xml_children(cntry) %>% map(xml_children) %>% 
    map(xml_attr,'id') %>% unlist() %>% as.list() %>%
    map(~as_tibble(.) %>% select(city = value)),
    # Second list: for every city, a tibble (column `place`) of its place ids.
    xml_children(cntry) %>% xml_children() %>% map(xml_children) %>% 
    map(xml_attr,'id') %>%
    # cbind() combines each one-row city tibble with that city's place tibble.
    map(~as_tibble(.) %>% select(place = value))),cbind)

返回这个

    city place
1     1   1.1
2     1   1.2
3     1   1.3
4     2   2.1
5     2   2.2
6     2   2.3
7     3   3.1
8     3   3.2
9     3   3.3
10    4   4.1
11    4   4.2
12    4   4.3

但是应用于我感兴趣的文件的相同代码失败:(有什么建议吗?

# Parse the nextbike feed with xml2 so that xml_children()/xml_attr() can be
# applied to the result.
# NOTE(review): the original call was garbled (function name and opening quote
# missing). `useInternalNodes` is an XML::xmlTreeParse() argument, not an
# xml2::read_xml() one, so it is omitted here; confirm against the original.
pfile <- read_xml("http://nextbike.net/maps/nextbike-official.xml")
# Pair each second-level element's `uid` with the `uid`s of its children,
# matching the two lists by position.
# NOTE(review): cbind() needs compatible row counts; the reported "1, 0" error
# presumably comes from a second-level element with no children (a one-row
# `city` tibble paired with a zero-row `place` tibble); confirm on the feed.
pmap_df(list(
  # First list: one one-row tibble (column `city`) per second-level element.
  xml_children(pfile) %>% map(xml_children) %>% 
    map(xml_attr,'uid') %>% unlist() %>% as.list() %>%
    map(~as_tibble(.) %>% select(city = value)),
  # Second list: per second-level element, a tibble (column `place`) holding
  # the `uid`s of its children.
  xml_children(pfile) %>% xml_children() %>% map(xml_children) %>% 
    map(xml_attr,'uid') %>%
    map(~as_tibble(.) %>% select(place = value))),cbind) 

Error in data.frame(..., check.names = FALSE) : 
      arguments imply differing number of rows: 1, 0
4

1 回答 1

1

一个使用 tidyverse 和 xml2 的解决方案

# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later with a confusing message.
library(xml2)
library(tidyverse)

cntry <- read_xml(countries.xml)

# Direct children of the root element: these are the parent-level nodes.
parents <- xml_children(cntry)

# Build one tibble per parent. The parent's single id is recycled across all of
# its children, so a parent with no children yields zero rows instead of an
# error. Column names match the original output (`country`, `place`); the
# result is a tibble rather than a plain data.frame.
map(parents, function(parent) {
  tibble(
    country = xml_attr(parent, "id"),
    place = xml_attr(xml_children(parent), "id")
  )
}) %>%
  bind_rows()

编辑:

我试图让它在超过 2 个级别上工作,但没有成功。这是我到目前为止提出的:

# Use library() so a missing package fails immediately; require() would only
# return FALSE and carry on.
library(xml2)
library(tidyverse)

# Parse the XML text held in `countries.xml` into an xml2 document.
parsedxml <- read_xml(countries.xml)

# Read the `id` attribute of `xml` and wrap each id in its own tibble.
# Returns a list with one tibble per id.
get_ids <- function(xml) {
  ids <- xml_attr(xml, 'id')
  map(ids, function(id) as_tibble(id))
}

# Ids per level, kept as nested lists that mirror the document tree:
# country[[x]] is the id of the x-th child of the root.
country <- parsedxml %>% xml_children() %>% map(get_ids)
# city[[x]][[y]] is the id of the y-th child of country x.
city <- parsedxml %>% xml_children() %>% map(~xml_children(.) %>% map(get_ids))
# place[[x]][[y]] holds the ids of all children of city y in country x.
place <- parsedxml %>% xml_children() %>% map(~xml_children(.) %>% map(~xml_children(.) %>% map(get_ids)))

# Combine the three levels for the first city of the first country only;
# apply(1, unlist) flattens the list entries into plain character values.
rbind(country[[1]],rbind(city[[1]][[1]],place[[1]][[1]])) %>% apply(1,unlist)

一个城市的结果

      [,1] [,2] [,3] 
value "c1" "1"  "1.1"
value "c1" "1"  "1.2"
value "c1" "1"  "1.3"

这个丑陋的代码将所有内容结合在一起:

# Stack the country/city/place id table of every city in every country.
# The index ranges come from the data (seq_along) instead of a hard-coded 1:2,
# so any number of countries and cities per country is handled.
do.call(rbind, lapply(seq_along(country), function(x) {
  # One id matrix per city of country x, built the same way as for one city.
  per_city <- lapply(seq_along(city[[x]]), function(y) {
    rbind(country[[x]], rbind(city[[x]][[y]], place[[x]][[y]])) %>%
      apply(1, unlist)
  })
  do.call(rbind, per_city)
}))

希望其他人对最后一部分有更好的解决方案。

于 2019-09-25T09:21:03.260 回答