0

我一直在努力将下面的代码转换为使用 *apply 系列函数,所以现在我向 StackOverflow 社区寻求一点帮助。一些背景知识,这是我正在开发的一种方法的一部分,用于分析三组的倾向评分方法。因此,我从代表每对组之间的距离(倾向得分差异)的三个矩阵开始。也就是说,矩阵 d1 是 A x B,d2 是 B x C,d3 是 C x A。我需要做的是找到最小化总距离并且小于某个卡尺的三元组。我已经尽可能地简化了这个例子,同时得到我想要的东西。

几点注意事项:

  • row1 <- row1[row1 < caliper]如果我要简单地创建所有可能组合的 data.frame(或矩阵),则可以在最后完成小于卡尺检查 ( ) 的距离。但是,即使我在这里设置的组数量很少,也会产生 3,000 行!

  • 在进行下一步之前,我对向量进行了排序。同样,如果我要拥有所有可能组合的矩阵,则可以消除这种情况。在我当前的版本中,我有另一行只查看 n 个最小的元素以减少执行时间。

  • 这个例子有相当小的组。我正在研究一个数据集,其中每个组有 5,000 到 8,000 个主题。

提前感谢您的帮助。我正在为此撰写一篇论文,并很乐意致谢。另外,我打算参加用户!在西班牙举行会议,将为任何有帮助的人买啤酒:-)

groups <- c('Control','Treat1','Treat2')
group.sizes <- c(15, 10, 20)
set.seed(2112)

d1 <- matrix(abs(rnorm(group.sizes[1] * group.sizes[2], mean=0, sd=1)), 
             nrow=group.sizes[1], ncol=group.sizes[2],
             dimnames=list(1:group.sizes[1], 
                          (group.sizes[1]+1):(group.sizes[1] + group.sizes[2])) )
d2 <- matrix(abs(rnorm(group.sizes[2] * group.sizes[3], mean=0, sd=1)), 
             nrow=group.sizes[2], ncol=group.sizes[3],
             dimnames=list((group.sizes[1]+1):(group.sizes[1] + group.sizes[2]), 
                          (group.sizes[2] + group.sizes[1] + 1):(sum(group.sizes)) ) )
d3 <- matrix(abs(rnorm(group.sizes[3] * group.sizes[1], mean=0, sd=1)), 
             nrow=group.sizes[3], ncol=group.sizes[1],
             dimnames=list((group.sizes[2] + group.sizes[1] + 1):(sum(group.sizes)), 
                          1:group.sizes[1]) )

caliper <- 1
results <- data.frame(v1=character(), v2=character(), v3=character(),
                      d1=numeric(), d2=numeric(), d3=numeric())
for(i1 in dimnames(d1)[[1]]) {
    row1 <- d1[i1,]
    row1 <- row1[row1 < caliper]
    row1 <- row1[order(row1)]
    for(i2 in names(row1)) {
        row2 <- d2[i2,]
        row2 <- row2[row2 < caliper]
        row2 <- row2[order(row2)]
        for(i3 in names(row2)) {
            val <- d3[i3,i1]
            if(val < caliper) {
                results <- rbind(results, 
                        data.frame(v1=i1, v2=i2, v3=i3,
                                d1=row1[i2], d2=row2[i3], d3=val))
            }
        }
    }
}
head(results)
4

1 回答 1

0

After some more work, I have figured out how to replace the three nested for loops with nested lapply function calls. To simplify testing the two approaches, I moved them to functions which are included below. This first chuck sets up the three matrices:

group.sizes <- c(15, 10, 20)
set.seed(2112)

d1 <- matrix(abs(rnorm(group.sizes[1] * group.sizes[2], mean=0, sd=1)), 
             nrow=group.sizes[1], ncol=group.sizes[2],
             dimnames=list(1:group.sizes[1], 
                          (group.sizes[1]+1):(group.sizes[1] + group.sizes[2])) )
d2 <- matrix(abs(rnorm(group.sizes[2] * group.sizes[3], mean=0, sd=1)), 
             nrow=group.sizes[2], ncol=group.sizes[3],
             dimnames=list((group.sizes[1]+1):(group.sizes[1] + group.sizes[2]), 
                          (group.sizes[2] + group.sizes[1] + 1):(sum(group.sizes)) ) )
d3 <- matrix(abs(rnorm(group.sizes[3] * group.sizes[1], mean=0, sd=1)), 
             nrow=group.sizes[3], ncol=group.sizes[1],
             dimnames=list((group.sizes[2] + group.sizes[1] + 1):(sum(group.sizes)), 
                          1:group.sizes[1]) )

Now the results with times

> system.time(results.forloops <- forloops(d1, d2, d3))
   user  system elapsed 
  2.129   0.370   2.530 
> system.time(results.apply <- nestedapply(d1, d2, d3))
   user  system elapsed 
  0.019   0.000   0.019 

Without much surprise, the lapply method is substantially faster, even with this small example. Warning, you can try this with larger matrices by changing the group.sizes factor above but the nested loops take a very long time to complete when making even small jumps in sizes.

Here are the functions:

forloops <- function(d1, d2, d3, caliper=1) {
    results <- data.frame(v1=character(), v2=character(), v3=character(),
                          d1=numeric(), d2=numeric(), d3=numeric())
    for(i1 in dimnames(d1)[[1]]) {
        row1 <- d1[i1,]
        row1 <- row1[row1 < caliper]
        #row1 <- row1[order(row1)]
        for(i2 in names(row1)) {
            row2 <- d2[i2,]
            row2 <- row2[row2 < caliper]
            #row2 <- row2[order(row2)]
            for(i3 in names(row2)) {
                val <- d3[i3,i1]
                if(val < caliper) {
                    results <- rbind(results, 
                                     data.frame(v1=i1, v2=i2, v3=i3,
                                               d1=row1[i2], d2=row2[i3], d3=val))
                }
            }
        }
    }
    results$total <- results$d1 + results$d2 + results$d3
    results <- results[order(results$total),]
    results <- results[!duplicated(results[,c('v1','v2')]), ]
    invisible(results)
}

nestedapply <- function(d1, d2, d3, caliper=1) {

    d1[d1 > caliper] <- NA
    d2[d2 > caliper] <- NA
    d3[d3 > caliper] <- NA

    results <- lapply(dimnames(d1)[[1]], FUN=function(i1) {
        row1 <- d1[i1,]
        row1 <- row1[!is.na(row1)]
        lapply(names(row1), FUN=function(i2) {
            row2 <- d2[i2,]
            row2 <- row2[!is.na(row2)]
            lapply(names(row2), FUN=function(i3) {
                val <- d3[i3,i1]
                if(is.na(val)) {
                    return(c())
                } else {
                    c(i1, i2, i3, row1[i2], row2[i3], val)
                }
            })
        })
    })
    results <- as.data.frame(matrix(unlist(results), ncol=6, byrow=TRUE), stringsAsFactors=FALSE)
    names(results) <- c('v1','v2','v3','d1','d2','d3')
    results$d1 <- as.numeric(results$d1)
    results$d2 <- as.numeric(results$d2)
    results$d3 <- as.numeric(results$d3)
    results$total <- results$d1 + results$d2 + results$d3
    invisible(results)
}
于 2013-02-11T01:14:14.263 回答