0

假设我有 5 个列相同但行数不同的数据框。我想创建一个新的数据框,从这 5 个数据框中各取出某一特定列,并在长度不足的地方用 NA(或其他值)填充。我在这里看到过一些问题,展示了如何对单个临时向量这样做,但我想找一种适用于更大数据集的方法。

例如:2个等长的数据帧:

# Two example data frames with the same columns and the same number of rows.
long <- data.frame(
  accepted = rnorm(n = 350, mean = 2000),
  cost = rnorm(n = 350, mean = 5000)
)
long2 <- data.frame(
  accepted = rnorm(n = 350, mean = 2000),
  cost = rnorm(n = 350, mean = 5000)
)

我可以创建一个组合它们的列表,然后创建一个空数据框并使用列表中数据框的公共变量填充它:

# Collect the data frames in a list so they can be processed together.
list1 <- list(long, long2)
# Pre-allocate the result: 350 rows, 5 columns (only the first two are filled).
df1 <- as.data.frame(matrix(0, ncol = 5, nrow = 350))
# Extract `accepted` from every list element. The data must come from `list1`;
# the bare name `list` is the base R function. vapply() also verifies that
# every extracted column really has 350 values instead of silently returning
# a ragged list.
df1[, 1:2] <- vapply(list1, `[[`, numeric(350), "accepted")

它有效。

但是当我有更多长度不等的数据帧时,这种方法会失败:

# Five example data frames sharing the same columns but with different
# numbers of rows (350, 350, 300, 150 and 50).
long <- data.frame(
  accepted = rnorm(n = 350, mean = 2000),
  cost = rnorm(n = 350, mean = 5000)
)
long2 <- data.frame(
  accepted = rnorm(n = 350, mean = 2000),
  cost = rnorm(n = 350, mean = 5000)
)
medlong <- data.frame(
  accepted = rnorm(n = 300, mean = 2000),
  cost = rnorm(n = 300, mean = 5000)
)
medshort <- data.frame(
  accepted = rnorm(n = 150, mean = 2000),
  cost = rnorm(n = 150, mean = 5000)
)
short <- data.frame(
  accepted = rnorm(n = 50, mean = 2000),
  cost = rnorm(n = 50, mean = 5000)
)

现在制作列表和组合数据框:

# Gather all five data frames (which have different row counts) in one list.
list2 <- list(long, long2, medlong, medshort, short)
# Pre-allocate a 350 x 5 data frame of zeros.
df2 <- as.data.frame(matrix(0, ncol = 5, nrow = 350))
# NOTE(review): this line reads `list` and writes to `df1`, not the `list2` and
# `df2` created just above -- presumably typos or leftover session variables;
# confirm. This is the assignment reported to fail with a size mismatch.
df1[,1:5] <- sapply(list, '[[', 'accepted')

我收到有关尺寸不匹配的错误:

Error in `[<-.data.frame`(`*tmp*`, , 1:5, value = c(1998.77096640377, : 替换有 700 个项目,需要 1750 个

我发现用与其他数据帧长度不等的列填充此数据帧的唯一解决方案是:

# Pre-allocate a 350 x 5 data frame of zeros to hold the combined columns.
combined.df <- as.data.frame(matrix(0, ncol = 5, nrow = 350))
# The longest data frame fills its column directly.
combined.df[, 1] <- long[, 2]
# A shorter column is padded with NA up to the length of the longest one.
# The closing parenthesis of c() is required; without it the line does not
# parse.
combined.df[, 2] <- c(medlong[, 2], rep(NA, nrow(long) - nrow(medlong)))

但是必须有一种更优雅、更快捷的方法来做到这一点……我知道我在这里遗漏了一些巨大的概念

4

3 回答 3

2

一种方法是先找出最长列的长度,然后在较短的列末尾补上相应数量的 NA。下面是一种实现(作为最小可运行示例 MWE,这里使用了规模更合理的小数据)……

# Pull the `accepted` column out of every data frame in the list
out <- lapply(list1, `[[`, "accepted")

# Find length of longest column
len <- max(lengths(out))

# Pad shorter columns with NA at the end. lapply() keeps the result a list;
# sapply() would simplify the now equal-length vectors to a matrix, which
# do.call() rejects because its second argument must be a list.
dfs <- lapply(out, function(x) c(x, rep(NA, len - length(x))))

# Make data.frame and set column names at same time
setNames(do.call(data.frame, dfs), paste0("V", seq_along(out)))
          V1         V2          V3
1 -1.0913212 -2.4864497  0.04220331
2 -0.5252874  0.8030984  0.21774515
3  0.6914167  0.9685629  1.47159957
4         NA         NA -0.89809670
5         NA         NA  0.51140539
6         NA         NA -0.46833136
7         NA         NA -0.40085707
于 2013-10-21T12:43:24.793 回答
1

您可以尝试使用 merge():

# Tag every data frame with its row index as an explicit key column so that
# merge() can align rows across frames of different lengths.
long$rn <- rownames(long)
long2$rn <- rownames(long2)
medlong$rn <- rownames(medlong)
medshort$rn <- rownames(medshort)
short$rn <- rownames(short)

# Columns taken from each additional data frame: the join key plus the column
# of interest. `cols` must be defined before it is used for subsetting.
cols <- c("rn", "accepted")

# Full outer joins (all = TRUE) keep every row index and fill the shorter
# frames with NA. Explicit suffixes keep the column names unique at each step
# instead of producing repeated `accepted.x` / `accepted.y` columns.
result <- merge(long, long2[, cols],
                by = "rn", all = TRUE, suffixes = c("", ".long2"))
result <- merge(result, medlong[, cols],
                by = "rn", all = TRUE, suffixes = c("", ".medlong"))
result <- merge(result, medshort[, cols],
                by = "rn", all = TRUE, suffixes = c("", ".medshort"))
result <- merge(result, short[, cols],
                by = "rn", all = TRUE, suffixes = c("", ".short"))

# merge() sorts on the character key ("1", "10", "100", ...); restore the
# original numeric row order.
result <- result[order(as.integer(result$rn)), ]
rownames(result) <- NULL
于 2013-10-21T12:37:15.413 回答
1

您还可以用超出实际行数的行索引对每个数据框取“子集”(例如 df[seq_len(nrow(df) + n), ]),这样多出的行会以 NA 填充:

#dataframes of different rows
# Data frames with different numbers of rows
long <- data.frame(accepted = rnorm(15, 2000), cost = rnorm(15, 5000))
long2 <- data.frame(accepted = rnorm(10, 2000), cost = rnorm(10, 5000))
long3 <- data.frame(accepted = rnorm(12, 2000), cost = rnorm(12, 5000))

# Put all data frames in a list so they can be manipulated together
myls <- list(long, long2, long3)

# Maximum number of rows, computed from the list itself so that adding or
# removing a data frame only requires changing `myls`
max.rows <- max(vapply(myls, nrow, integer(1)))

# Indexing rows past nrow(x) yields all-NA rows, which pads every data frame
# to `max.rows` rows. drop = FALSE keeps the result a data frame even when a
# data frame has a single column.
new_myls <- lapply(myls, function(x) x[seq_len(max.rows), , drop = FALSE])

# Bind the `accepted` column of every padded data frame side by side
do.call(cbind, lapply(new_myls, `[`, "accepted"))

#   accepted accepted accepted
#1  2001.581 1999.014 2001.810
#2  2000.071 2000.033 2000.588
#3  1999.931 2000.188 2000.833
#4  1998.467 1999.891 1997.645
#5  2000.682 2000.144 1999.639
#6  1999.693 1999.341 1998.959
#7  2000.222 1998.939 2002.271
#8  1999.104 1998.530 1997.600
#9  1998.435 2001.496 2001.129
#10 1998.160 2000.729 2001.602
#11 1999.267       NA 1999.733
#12 2000.048       NA 2001.431
#13 1999.504       NA       NA
#14 2000.660       NA       NA
#15 2000.160       NA       NA
于 2013-10-21T13:03:10.940 回答