0

我有两个 tibble,每个 tibble 最多有 4 列。每个列名要么是两者共有的,要么在其中一个里缺失。我需要把它们合并成一个有两行的 tibble,并在缺失的列中填入 NA。我需要一种通用的做法,使其在缺失的列更多或更少时同样适用。下面是从两个示例网页生成这两个 tibble 的代码:

library(tidyverse)
library(htmltab)

# Tally the rows of a saved parkrun results page by note category.
#
# Args:
#   filename: path to a saved HTML page containing a table with id "results".
#
# Returns:
#   The per-category counts transposed back into a row. Columns take their
#   names from the first 12 characters of the note they count; a missing
#   note yields an NA column name.
#   NOTE(review): with only one category the `[-1, ]` subsetting may drop
#   the column name -- confirm.
read_results <- function(filename) {
  page <- read_file(filename)
  results <- as_tibble(htmltab(doc = page, which = "//table[@id='results']"))
  colnames(results) <- c(
    "pos", "name", "time", "age_cat", "age_grade",
    "gender", "gender_pos", "note", "total_runs"
  )

  # One summary row per distinct 12-character note prefix.
  note_counts <- results %>%
    group_by(substr(note, 1, 12)) %>%
    summarise(number = n())

  # After transposing, row 1 carries the prefixes and row 2 the counts.
  flipped <- t(note_counts)
  colnames(flipped) <- as.character(unlist(flipped[1, ]))

  # Discard the prefix row (now duplicated in the column names) and
  # transpose what is left back into a row.
  t(flipped[-1, ])
}

# Results page saved from
# http://www.parkrun.org.uk/henleyonthames/results/weeklyresults/?runSeqNumber=2
r2 <- read_results("results _ henleyonthames parkrun_2.html")
# Results page saved from
# http://www.parkrun.org.uk/henleyonthames/results/weeklyresults/?runSeqNumber=4
r4 <- read_results("results _ henleyonthames parkrun_4.html")

现在 r2 和 r4 包含:

> r2
     First Timer! New PB! PB stays at  <NA>
[1,] "58"         "11"    " 3"         " 4"
> r4
     First Timer! New PB! PB stays at 
[1,] "62"         "16"    "11"        

我想构建t_all

     First Timer! New PB! PB stays at  <NA>
     58           11        3           4
     62           16       11           0
4

1 回答 1

3

问题在于 r2 中有一列的列名是 NA,因此大多数按列名匹配类矩阵对象的函数都会失败。解决办法是在您的函数中加入下面这一行,把 NA 列名替换为 "Blank":`names(tib)[is.na(names(tib))] <- "Blank"`

library(tidyverse)
library(htmltab)

# Count the rows in each note category of a saved parkrun results page.
#
# Args:
#   filename: path to a saved HTML page containing a table with id "results".
#
# Returns:
#   A one-row character matrix with one column per note category (the first
#   12 characters of the note), holding the number of rows in that category.
#   A category whose label is NA is reported under the column name "Blank".
#   The counts are strings because the transposed summary mixes labels and
#   counts in a single character matrix.
read_results <- function(filename) {
  doc <- read_file(filename)
  df <- as_tibble(htmltab(doc = doc, which = "//table[@id='results']"))
  colnames(df) <- c(
    "pos", "name", "time", "age_cat", "age_grade",
    "gender", "gender_pos", "note", "total_runs"
  )

  # Transposed summary: row 1 holds the category labels, row 2 the counts.
  tib <- t(df %>% group_by(substr(note, 1, 12)) %>% summarise(number = n()))
  labels <- as.character(unlist(tib[1, ]))

  # Attach the labels explicitly instead of relying on `[-1, ]` to carry
  # them over: with a single category the subset collapses to a length-one
  # vector and the label would otherwise be lost.
  tib <- setNames(tib[-1, ], labels)
  names(tib)[is.na(names(tib))] <- "Blank"  ## New Line

  # t() of a named vector gives a one-row matrix with those column names.
  t(tib)
}

# Results page saved from
# http://www.parkrun.org.uk/henleyonthames/results/weeklyresults/?runSeqNumber=2
r2 <- read_results("results _ henleyonthames parkrun_2.html")
# Results page saved from
# http://www.parkrun.org.uk/henleyonthames/results/weeklyresults/?runSeqNumber=4
r4 <- read_results("results _ henleyonthames parkrun_4.html")

# Stack the two one-row results. bind_rows() matches columns by name and
# fills a column that is missing from one input with NA.
# as_tibble() is used in place of as_data_frame(), which tibble deprecated.
dplyr::bind_rows(as_tibble(r2), as_tibble(r4))

# A tibble: 2 × 4
  `First Timer!` `New PB!` `PB stays at ` Blank
           <chr>     <chr>          <chr> <chr>
1             58        11              3     4
2             62        16             11  <NA>
于 2017-08-11T21:57:36.277 回答